diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml index b2b4bfc3109d9aa10966b2866c472fa4d8457c5f..4ae42bc93ff649401818962ee7e4f16c2c468cb2 100644 --- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml +++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml @@ -10,7 +10,7 @@ jobs: continue-on-error: true strategy: matrix: - distro: ['fedora:latest', 'fedora:rawhide', 'ubuntu:latest'] + distro: ['fedora:latest', 'ubuntu:latest'] cxx: ['g++', 'clang++'] cmake_build_type: ['Release', 'Debug'] backend: ['OPENMP'] diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo index 9b53d527695cc0db4672b75762f449344c305baa..4c3eed6dba9d2682fd85d2185169ac1f5c8e17a6 100644 --- a/packages/kokkos/.gitrepo +++ b/packages/kokkos/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = git@github.com:kokkos/kokkos.git branch = master - commit = 61d7db55fceac3318c987a291f77b844fd94c165 - parent = 91d53e3cfb9a55832aae102ca677044a47f2515d + commit = 5ad609661e570ba6aa7716a26a91cb67d559f8a2 + parent = db24c19be339723f3a10fffce3075fd72dccdfeb method = merge - cmdver = 0.4.3 + cmdver = 0.4.5 diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins index 09052840e6a7ea4115c1be9abb3f2aa2e644eabd..3025cb558eab0c10d6be328b199515807fbba03b 100644 --- a/packages/kokkos/.jenkins +++ b/packages/kokkos/.jenkins @@ -347,7 +347,7 @@ pipeline { dockerfile { filename 'Dockerfile.nvcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0-devel --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' + additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0.3-devel-ubuntu18.04 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3' label 'nvidia-docker' args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES' } diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index e81f2944519e1b39b31e1c9d7332b3aa6cb8d45e..bdbc75604bab5fbbfd436767531ab30371cc788b 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,5 +1,27 @@ # Change Log +## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01) + +### Bug Fixes: +- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547) +- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552) +- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521) +- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553) +- Fix warning calling a `__host__` function from a `__host__ __device__` from `View:: as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591) +- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557) +- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559) +- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561) +- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565) +- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566) +- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569) +- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571) +- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582) +- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573) +- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592) +- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607) +- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622) + ## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00) @@ -102,7 +124,6 @@ - Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120) - Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117) - Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111) -- Deprecate `Kokkos::common_view_alloc_prop` [\#5059](https://github.com/kokkos/kokkos/pull/5059) - Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957) - Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810) - Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382) diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index a05bfcdb94d53e0a7d453d62909e9a5686f6cc41..7b78f29d7340499aff394302911e59c5ef120d52 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -129,7 +129,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) set(Kokkos_VERSION_MINOR 7) -set(Kokkos_VERSION_PATCH 00) +set(Kokkos_VERSION_PATCH 01) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -152,6 +152,7 @@ ENDIF() # but scoping issues can make it difficult GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) GLOBAL_SET(KOKKOS_LINK_OPTIONS) +GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) @@ -228,6 +229,9 @@ IF (KOKKOS_HAS_TRILINOS) # we have to match the annoying behavior, also we have to preserve quotes # which needs another workaround. SET(KOKKOS_COMPILE_OPTIONS_TMP) + IF (KOKKOS_ENABLE_HIP) + LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) + ENDIF() FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) IF(OPTION_HAS_WHITESPACE EQUAL -1) diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index d493abbf1421973a973e93775d90ef83e502e2cd..2e32c9d53893bf552381eb618db37acaf8156822 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -2,7 +2,7 @@ KOKKOS_VERSION_MAJOR = 3 KOKKOS_VERSION_MINOR = 7 -KOKKOS_VERSION_PATCH = 00 +KOKKOS_VERSION_PATCH = 01 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -495,10 +495,6 @@ KOKKOS_LINK_FLAGS = KOKKOS_SRC = KOKKOS_HEADERS = -#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) - KOKKOS_LIBS += -latomic -#endif - # Generating the KokkosCore_config.h file. KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp @@ -540,6 +536,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + KOKKOS_LIBS += -latomic tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET') ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC") diff --git a/packages/kokkos/algorithms/cmake/Dependencies.cmake b/packages/kokkos/algorithms/cmake/Dependencies.cmake index 1b413106817cc6adf18dc94189203a27e641c6d5..c36b62523fadb628e970b6eccf57a9caaa317f1e 100644 --- a/packages/kokkos/algorithms/cmake/Dependencies.cmake +++ b/packages/kokkos/algorithms/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp index ad0c2d47b6d20d2022b5c60e81e7268b53d47f13..c7be70e09a48eff3fb2bf3a7cb89be1cf1fe6664 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -265,8 +265,8 @@ class BinSort { //---------------------------------------- // Create the permutation vector, the bin_offset array and the bin_count // array. Can be called again if keys changed - template <class ExecutionSpace = exec_space> - void create_permute_vector(const ExecutionSpace& exec = exec_space{}) { + template <class ExecutionSpace> + void create_permute_vector(const ExecutionSpace& exec) { static_assert( Kokkos::SpaceAccessibility<ExecutionSpace, typename Space::memory_space>::accessible, @@ -297,6 +297,15 @@ class BinSort { *this); } + // Create the permutation vector, the bin_offset array and the bin_count + // array. Can be called again if keys changed + void create_permute_vector() { + Kokkos::fence("Kokkos::Binsort::create_permute_vector: before"); + exec_space e{}; + create_permute_vector(e); + e.fence("Kokkos::Binsort::create_permute_vector: after"); + } + // Sort a subset of a view with respect to the first dimension using the // permutation array template <class ExecutionSpace, class ValuesViewType> @@ -372,9 +381,10 @@ class BinSort { template <class ValuesViewType> void sort(ValuesViewType const& values, int values_range_begin, int values_range_end) const { + Kokkos::fence("Kokkos::Binsort::sort: before"); exec_space exec; sort(exec, values, values_range_begin, values_range_end); - exec.fence("Kokkos::Sort: fence after sorting"); + exec.fence("Kokkos::BinSort:sort: after"); } template <class ExecutionSpace, class ValuesViewType> @@ -641,9 +651,10 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort( template <class ViewType> void sort(ViewType const& view) { + Kokkos::fence("Kokkos::sort: before"); typename ViewType::execution_space exec; sort(exec, view); - exec.fence("Kokkos::Sort: fence after sorting"); + exec.fence("Kokkos::sort: fence after sorting"); } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 @@ -682,6 +693,7 @@ std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort( template <class ViewType> void sort(ViewType view, size_t const begin, size_t const end) { + Kokkos::fence("Kokkos::sort: before"); typename ViewType::execution_space exec; sort(exec, view, begin, end); exec.fence("Kokkos::Sort: fence after sorting"); diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index 34807ac2b26228a4f0c10aa3ee5c4f7951ac235f..88ddc483786112ca0e70f418921d89ae102b9dde 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -66,6 +66,7 @@ #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX2 +#cmakedefine KOKKOS_ARCH_A64FX #cmakedefine KOKKOS_ARCH_AMD_AVX2 #cmakedefine KOKKOS_ARCH_AVX #cmakedefine KOKKOS_ARCH_AVX2 diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake index d4c2cda651f3510bd66e9b8faff344ebf0cf666a..ef16aad047a96cfb31f3ae6c5ecaa93ff8175539 100644 --- a/packages/kokkos/cmake/kokkos_arch.cmake +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -187,7 +187,9 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(CUDA_ARCH_FLAG "-gpu") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) + IF (KOKKOS_ENABLE_CUDA) # FIXME ideally unreachable when CUDA not enabled + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -cuda) + ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) SET(CUDA_ARCH_FLAG "-arch") ENDIF() diff --git a/packages/kokkos/containers/cmake/Dependencies.cmake b/packages/kokkos/containers/cmake/Dependencies.cmake index 5e29157369c9ab8cab935a1bfc4c6dad2fdd0296..1d71d8af341181f689a6a8bf63036b67584cb138 100644 --- a/packages/kokkos/containers/cmake/Dependencies.cmake +++ b/packages/kokkos/containers/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp index 442f0d8617524dc0c1459bf10110891e97b3a6b2..059ce8a610d26c9072b1cd15a282364855130d9a 100644 --- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -1701,7 +1701,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template <unsigned N, typename T, typename... Args> -KOKKOS_FUNCTION auto as_view_of_rank_n(DynRankView<T, Args...> v) { +KOKKOS_FUNCTION auto as_view_of_rank_n( + DynRankView<T, Args...> v, + typename std::enable_if<std::is_same< + typename ViewTraits<T, Args...>::specialize, void>::value>::type* = + nullptr) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -2114,9 +2118,10 @@ inline auto create_mirror( namespace Impl { template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - std::is_same< - typename DynRankView<T, P...>::memory_space, - typename DynRankView<T, P...>::HostMirror::memory_space>::value && + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + std::is_same< + typename DynRankView<T, P...>::memory_space, + typename DynRankView<T, P...>::HostMirror::memory_space>::value && std::is_same< typename DynRankView<T, P...>::data_type, typename DynRankView<T, P...>::HostMirror::data_type>::value, @@ -2128,12 +2133,13 @@ create_mirror_view(const DynRankView<T, P...>& src, template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - !(std::is_same< - typename DynRankView<T, P...>::memory_space, - typename DynRankView<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename DynRankView<T, P...>::data_type, - typename DynRankView<T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + !(std::is_same< + typename DynRankView<T, P...>::memory_space, + typename DynRankView<T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename DynRankView<T, P...>::data_type, + typename DynRankView<T, P...>::HostMirror::data_type>::value), typename DynRankView<T, P...>::HostMirror> create_mirror_view( const DynRankView<T, P...>& src, @@ -2141,29 +2147,39 @@ create_mirror_view( return Kokkos::Impl::create_mirror(src, arg_prop); } -template <class Space, class T, class... P, class... ViewCtorArgs> +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> inline std::enable_if_t< - Kokkos::is_space<Space>::value && - Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace, - typename Impl::MirrorDRViewType<Space, T, P...>::view_type> -create_mirror_view(const Space&, const Kokkos::DynRankView<T, P...>& src, + Kokkos::is_space< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value && + Impl::MirrorDRViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::is_same_memspace, + typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::view_type> +create_mirror_view(const Kokkos::DynRankView<T, P...>& src, const typename Impl::ViewCtorProp<ViewCtorArgs...>&) { return src; } -template <class Space, class T, class... P, class... ViewCtorArgs> +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> inline std::enable_if_t< - Kokkos::is_space<Space>::value && - !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace, - typename Impl::MirrorDRViewType<Space, T, P...>::view_type> + Kokkos::is_space< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space>::value && + !Impl::MirrorDRViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::is_same_memspace, + typename Impl::MirrorDRViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, T, + P...>::view_type> create_mirror_view( - const Space&, const Kokkos::DynRankView<T, P...>& src, + const Kokkos::DynRankView<T, P...>& src, const typename Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - using MemorySpace = typename Space::memory_space; - using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>; - alloc_prop prop_copy(arg_prop); - - return Kokkos::Impl::create_mirror(src, prop_copy); + return Kokkos::Impl::create_mirror(src, arg_prop); } } // namespace Impl @@ -2224,9 +2240,10 @@ create_mirror_view( template <class Space, class T, class... P> inline auto create_mirror_view(Kokkos::Impl::WithoutInitializing_t wi, - const Space& space, + const Space&, const Kokkos::DynRankView<T, P...>& src) { - return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } template <class T, class... P, class... ViewCtorArgs> diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp index 015a75cb0b02c602db2a3bded219497c3414595c..a2b68064de13bd2b8988b6a2025bc3c9ef2c2685 100644 --- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -710,7 +710,7 @@ template <class Space, class T, class... P> inline auto create_mirror( const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) { return Impl::create_mirror( - src, Impl::ViewCtorProp<>{typename Space::memory_space{}}); + src, Kokkos::view_alloc(typename Space::memory_space{})); } template <class Space, class T, class... P> @@ -729,48 +729,68 @@ inline auto create_mirror( } namespace Impl { + template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - (std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + (std::is_same< + typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::DynamicView<T, P...>::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror> -create_mirror_view( - const typename Kokkos::Experimental::DynamicView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { +create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, + const Impl::ViewCtorProp<ViewCtorArgs...>&) { return src; } template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - !(std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::DynamicView<T, P...>::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + !(std::is_same< + typename Kokkos::Experimental::DynamicView<T, P...>::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::DynamicView<T, P...>::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::DynamicView<T, P...>::HostMirror> create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { return Kokkos::create_mirror(arg_prop, src); } -template <class Space, class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t< - Impl::MirrorDynamicViewType<Space, T, P...>::is_same_memspace, - typename Kokkos::Impl::MirrorDynamicViewType<Space, T, P...>::view_type> -create_mirror_view(const Space&, - const Kokkos::Experimental::DynamicView<T, P...>& src, +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +std::enable_if_t<Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>&) { return src; } + +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +std::enable_if_t<!Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorDynamicViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src, + const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { + return Kokkos::Impl::create_mirror(src, arg_prop); +} } // namespace Impl // Create a mirror view in host space @@ -790,8 +810,9 @@ inline auto create_mirror_view( // Create a mirror in a new space template <class Space, class T, class... P> inline auto create_mirror_view( - const Space& space, const Kokkos::Experimental::DynamicView<T, P...>& src) { - return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); + const Space&, const Kokkos::Experimental::DynamicView<T, P...>& src) { + return Impl::create_mirror_view(src, + view_alloc(typename Space::memory_space{})); } template <class Space, class T, class... P> diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp index 0b54d1bdd952f33e433f17b05c56ef415ee286b4..5027763a0297a00c2b9dfb28734da628e763d7dc 100644 --- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -1901,19 +1901,22 @@ struct MirrorOffsetType { namespace Impl { template <class T, class... P, class... ViewCtorArgs> -inline typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror +inline std::enable_if_t< + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space, + typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { return typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); } -template <class Space, class T, class... P, class... ViewCtorArgs> -inline typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type -create_mirror(const Space&, - const Kokkos::Experimental::OffsetView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +inline auto create_mirror(const Kokkos::Experimental::OffsetView<T, P...>& src, + const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { using alloc_prop_input = Impl::ViewCtorProp<ViewCtorArgs...>; + using Space = typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space; static_assert( !alloc_prop_input::has_label, @@ -1923,10 +1926,6 @@ create_mirror(const Space&, !alloc_prop_input::has_pointer, "The view constructor arguments passed to Kokkos::create_mirror must " "not include a pointer!"); - static_assert( - !alloc_prop_input::has_memory_space, - "The view constructor arguments passed to Kokkos::create_mirror must " - "not include a memory space instance!"); static_assert( !alloc_prop_input::allow_padding, "The view constructor arguments passed to Kokkos::create_mirror must " @@ -1962,15 +1961,17 @@ inline auto create_mirror( template <class Space, class T, class... P, typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> inline auto create_mirror( - const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) { - return Impl::create_mirror(space, src, Impl::ViewCtorProp<>{}); + const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) { + return Impl::create_mirror( + src, Kokkos::view_alloc(typename Space::memory_space{})); } template <class Space, class T, class... P> typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space& space, +create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) { - return Impl::create_mirror(space, src, Kokkos::view_alloc(wi)); + return Impl::create_mirror( + src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } template <class T, class... P, class... ViewCtorArgs> @@ -1983,54 +1984,64 @@ inline auto create_mirror( namespace Impl { template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - (std::is_same< - typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same<typename Kokkos::Experimental::OffsetView<T, P...>::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + (std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> -create_mirror_view( - const typename Kokkos::Experimental::OffsetView<T, P...>& src, - const Impl::ViewCtorProp<ViewCtorArgs...>&) { +create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, + const Impl::ViewCtorProp<ViewCtorArgs...>&) { return src; } template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - !(std::is_same< - typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::Experimental::OffsetView<T, P...>::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + !(std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::Experimental::OffsetView<T, P...>::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>::value), typename Kokkos::Experimental::OffsetView<T, P...>::HostMirror> create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { return Kokkos::create_mirror(arg_prop, src); } -template <class Space, class T, class... P, class... ViewCtorArgs> -inline std::enable_if_t< - Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace, - Kokkos::Experimental::OffsetView<T, P...>> -create_mirror_view(const Space&, - const Kokkos::Experimental::OffsetView<T, P...>& src, +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +std::enable_if_t<Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>&) { return src; } -template <class Space, class T, class... P, class... ViewCtorArgs> -std::enable_if_t< - !Impl::MirrorOffsetViewType<Space, T, P...>::is_same_memspace, - typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type> -create_mirror_view(const Space& space, - const Kokkos::Experimental::OffsetView<T, P...>& src, +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +std::enable_if_t<!Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorOffsetViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::Experimental::OffsetView<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - return create_mirror(space, src, arg_prop); + return Kokkos::Impl::create_mirror(src, arg_prop); } } // namespace Impl @@ -2052,15 +2063,17 @@ inline auto create_mirror_view( template <class Space, class T, class... P, typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> inline auto create_mirror_view( - const Space& space, const Kokkos::Experimental::OffsetView<T, P...>& src) { - return Impl::create_mirror_view(space, src, Impl::ViewCtorProp<>{}); + const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) { + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space{})); } template <class Space, class T, class... P> inline auto create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, const Space& space, + Kokkos::Impl::WithoutInitializing_t wi, const Space&, const Kokkos::Experimental::OffsetView<T, P...>& src) { - return Impl::create_mirror_view(space, src, Kokkos::view_alloc(wi)); + return Impl::create_mirror_view( + src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } template <class T, class... P, class... ViewCtorArgs> diff --git a/packages/kokkos/containers/unit_tests/CMakeLists.txt b/packages/kokkos/containers/unit_tests/CMakeLists.txt index f16572b60300562eabd01563ee2469cfa899bf65..261d9dcd4215d712ef7b6fca3b0ad08c9ecb0052 100644 --- a/packages/kokkos/containers/unit_tests/CMakeLists.txt +++ b/packages/kokkos/containers/unit_tests/CMakeLists.txt @@ -46,3 +46,13 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() + +SET(COMPILE_ONLY_SOURCES + TestCreateMirror.cpp +) +KOKKOS_ADD_EXECUTABLE( + TestCompileOnly + SOURCES + TestCompileMain.cpp + ${COMPILE_ONLY_SOURCES} +) diff --git a/packages/kokkos/containers/unit_tests/TestCompileMain.cpp b/packages/kokkos/containers/unit_tests/TestCompileMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..237c8ce181774d991a9dbdd8cacf1a5fb9f199f1 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestCompileMain.cpp @@ -0,0 +1 @@ +int main() {} diff --git a/packages/kokkos/containers/unit_tests/TestCreateMirror.cpp b/packages/kokkos/containers/unit_tests/TestCreateMirror.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0e43be4364154393b30cd349a563d7984a5ca2f0 --- /dev/null +++ b/packages/kokkos/containers/unit_tests/TestCreateMirror.cpp @@ -0,0 +1,179 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <Kokkos_DynamicView.hpp> +#include <Kokkos_DynRankView.hpp> +#include <Kokkos_OffsetView.hpp> + +template <typename TestView, typename MemorySpace> +void check_memory_space(TestView, MemorySpace) { + static_assert( + std::is_same<typename TestView::memory_space, MemorySpace>::value, ""); +} + +template <class View> +auto host_mirror_test_space(View) { + return std::conditional_t< + Kokkos::SpaceAccessibility<Kokkos::HostSpace, + typename View::memory_space>::accessible, + typename View::memory_space, Kokkos::HostSpace>{}; +} + +template <typename View> +void test_create_mirror_properties(const View& view) { + using namespace Kokkos; + using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space; + + // clang-format off + + // create_mirror +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view<View>::value) { + check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror( view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view<View>::value) { + check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view( view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror view_alloc +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view<View>::value) { + check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view<View>::value) { + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror view_alloc + execution space +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view<View>::value) { + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc + execution space +#ifndef KOKKOS_ENABLE_CXX14 + // FIXME DynamicView: HostMirror is the same type + if constexpr (!is_dynamic_view<View>::value) { + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + } +#endif + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy + check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + execution space + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{}); + + // clang-format on +} + +void test_create_mirror_dynrankview() { + Kokkos::DynRankView<int, Kokkos::DefaultExecutionSpace> device_view( + "device view", 10); + Kokkos::DynRankView<int, Kokkos::HostSpace> host_view("host view", 10); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} + +void test_reate_mirror_offsetview() { + Kokkos::Experimental::OffsetView<int*, Kokkos::DefaultExecutionSpace> + device_view("device view", {0, 10}); + Kokkos::Experimental::OffsetView<int*, Kokkos::HostSpace> host_view( + "host view", {0, 10}); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} + +void test_create_mirror_dynamicview() { + Kokkos::Experimental::DynamicView<int*, Kokkos::DefaultExecutionSpace> + device_view("device view", 2, 10); + Kokkos::Experimental::DynamicView<int*, Kokkos::HostSpace> host_view( + "host view", 2, 10); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} diff --git a/packages/kokkos/core/cmake/Dependencies.cmake b/packages/kokkos/core/cmake/Dependencies.cmake index cc901a4ede0c6b17fbb89bfa9edfaf6544d7b269..611c089b2e3feec2ec79228360f93c242fc055e2 100644 --- a/packages/kokkos/core/cmake/Dependencies.cmake +++ b/packages/kokkos/core/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 88810b6fc2bbcf5c6fef3bd4a9de0a72fb30c5e8..b7a80ad84ff22b00d9666956cf5896b259d38b6a 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -636,7 +636,7 @@ struct CudaParallelLaunchImpl< DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>>( base_t::get_kernel_func(), prefer_shmem); - ensure_cuda_lock_arrays_on_device(); + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); // Invoke the driver function on the device base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp index 3796534816a8bdb6bbeb1e517c6e54a04f2c82e1..84d4307cfd549f9567cb2bc5982a882543e19168 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -79,7 +79,8 @@ CudaLockArrays g_host_cuda_lock_arrays = {nullptr, 0}; void initialize_host_cuda_lock_arrays() { #ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS desul::Impl::init_lock_arrays(); - desul::ensure_cuda_lock_arrays_on_device(); + + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); #endif if (g_host_cuda_lock_arrays.atomic != nullptr) return; KOKKOS_IMPL_CUDA_SAFE_CALL( @@ -88,7 +89,7 @@ void initialize_host_cuda_lock_arrays() { Impl::cuda_device_synchronize( "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); g_host_cuda_lock_arrays.n = Cuda::concurrency(); - copy_cuda_lock_arrays_to_device(); + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); Impl::cuda_device_synchronize( @@ -105,7 +106,7 @@ void finalize_host_cuda_lock_arrays() { g_host_cuda_lock_arrays.atomic = nullptr; g_host_cuda_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - copy_cuda_lock_arrays_to_device(); + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); #endif } diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp index 244f142f0d83550bf79d5cfb5288494e2629226f..bdb7723985e5a3c6c0451ada3d0b6b7303204089 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -67,7 +67,7 @@ struct CudaLockArrays { /// \brief This global variable in Host space is the central definition /// of these arrays. -extern CudaLockArrays g_host_cuda_lock_arrays; +extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays; /// \brief After this call, the g_host_cuda_lock_arrays variable has /// valid, initialized arrays. @@ -105,12 +105,12 @@ namespace Impl { /// instances in other translation units, we must update this CUDA global /// variable based on the Host global variable prior to running any kernels /// that will use it. -/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. +/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. __device__ #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE __constant__ extern #endif - CudaLockArrays g_device_cuda_lock_arrays; + Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays; #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF @@ -123,7 +123,9 @@ __device__ inline bool lock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - return (0 == atomicCAS(&g_device_cuda_lock_arrays.atomic[offset], 0, 1)); + return ( + 0 == + atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0, 1)); } /// \brief Release lock for the address @@ -136,7 +138,7 @@ __device__ inline void unlock_address_cuda_space(void* ptr) { size_t offset = size_t(ptr); offset = offset >> 2; offset = offset & CUDA_SPACE_ATOMIC_MASK; - atomicExch(&g_device_cuda_lock_arrays.atomic[offset], 0); + atomicExch(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset], 0); } } // namespace Impl @@ -149,49 +151,45 @@ namespace { static int lock_array_copied = 0; inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace +} // namespace Impl +} // namespace Kokkos -#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline -#else -static -#endif - void - copy_cuda_lock_arrays_to_device() { - if (lock_array_copied == 0) { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpyToSymbol(g_device_cuda_lock_arrays, - &g_host_cuda_lock_arrays, - sizeof(CudaLockArrays))); +/* Dan Ibanez: it is critical that this code be a macro, so that it will + capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays! + putting this in an inline function will NOT do the right thing! */ +#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + { \ + if (::Kokkos::Impl::lock_array_copied == 0) { \ + KOKKOS_IMPL_CUDA_SAFE_CALL( \ + cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \ + &Kokkos::Impl::g_host_cuda_lock_arrays, \ + sizeof(Kokkos::Impl::CudaLockArrays))); \ + } \ + lock_array_copied = 1; \ } - lock_array_copied = 1; -} #ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline void ensure_cuda_lock_arrays_on_device() {} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else -inline static void ensure_cuda_lock_arrays_on_device() { - copy_cuda_lock_arrays_to_device(); -} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() #endif #else #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -inline void ensure_cuda_lock_arrays_on_device() {} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else // Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. -inline static void ensure_cuda_lock_arrays_on_device() { - copy_cuda_lock_arrays_to_device(); - desul::ensure_cuda_lock_arrays_on_device(); -} +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #endif #endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ -} // namespace Impl -} // namespace Kokkos - #endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 98733430063d98815ececb0ac9fcf83592a4e681..ac160f8fe268a42e04eebcee2639160e7edbd512 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -465,8 +465,24 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { public: using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; + using value_type = typename Analysis::value_type; using functor_type = FunctorType; using size_type = Cuda::size_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::Cuda::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>; private: // Algorithmic constraints: @@ -477,7 +493,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - size_type* m_scratch_space; + word_size_type* m_scratch_space; size_type* m_scratch_flags; size_type m_final; #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION @@ -501,12 +517,12 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { __device__ inline void initial() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant<size_type, Analysis::StaticValueSize / - sizeof(size_type)> - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize / + sizeof(word_size_type)> + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); - size_type* const shared_value = - kokkos_impl_cuda_shared_memory<size_type>() + + word_size_type* const shared_value = + kokkos_impl_cuda_shared_memory<word_size_type>() + word_count.value * threadIdx.y; final_reducer.init(reinterpret_cast<pointer_type>(shared_value)); @@ -532,7 +548,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { // gridDim.x cuda_single_inter_block_reduce_scan<true>( final_reducer, blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, + kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space, m_scratch_flags); } @@ -541,21 +557,22 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { __device__ inline void final() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant<size_type, Analysis::StaticValueSize / - sizeof(size_type)> - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize / + sizeof(word_size_type)> + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } - size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); - size_type* const shared_prefix = + word_size_type* const shared_data = + kokkos_impl_cuda_shared_memory<word_size_type>(); + word_size_type* const shared_prefix = shared_data + word_count.value * threadIdx.y; - size_type* const shared_accum = + word_size_type* const shared_accum = shared_data + word_count.value * (blockDim.y + 1); // Starting value for this thread block is the previous block's total. if (blockIdx.x) { - size_type* const block_total = + word_size_type* const block_total = m_scratch_space + word_count.value * (blockIdx.x - 1); for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -602,7 +619,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { typename Analysis::pointer_type(shared_data + word_count.value)); { - size_type* const block_total = + word_size_type* const block_total = shared_data + word_count.value * blockDim.y; for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -690,8 +707,9 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> { // How many block are really needed for this much work: const int grid_x = (nwork + work_per_block - 1) / work_per_block; - m_scratch_space = cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * grid_x); + m_scratch_space = + reinterpret_cast<word_size_type*>(cuda_internal_scratch_space( + m_policy.space(), Analysis::value_size(m_functor) * grid_x)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); @@ -752,10 +770,26 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, Policy, FunctorType>; public: + using value_type = typename Analysis::value_type; using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; using size_type = Cuda::size_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::Cuda::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>; private: // Algorithmic constraints: @@ -766,7 +800,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, const FunctorType m_functor; const Policy m_policy; - size_type* m_scratch_space; + word_size_type* m_scratch_space; size_type* m_scratch_flags; size_type m_final; ReturnType& m_returnvalue; @@ -791,12 +825,12 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, __device__ inline void initial() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant<size_type, Analysis::StaticValueSize / - sizeof(size_type)> - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize / + sizeof(word_size_type)> + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); - size_type* const shared_value = - kokkos_impl_cuda_shared_memory<size_type>() + + word_size_type* const shared_value = + kokkos_impl_cuda_shared_memory<word_size_type>() + word_count.value * threadIdx.y; final_reducer.init(reinterpret_cast<pointer_type>(shared_value)); @@ -822,7 +856,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, // gridDim.x cuda_single_inter_block_reduce_scan<true>( final_reducer, blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory<size_type>(), m_scratch_space, + kokkos_impl_cuda_shared_memory<word_size_type>(), m_scratch_space, m_scratch_flags); } @@ -831,21 +865,22 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, __device__ inline void final() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant<size_type, Analysis::StaticValueSize / - sizeof(size_type)> - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize / + sizeof(word_size_type)> + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } - size_type* const shared_data = kokkos_impl_cuda_shared_memory<size_type>(); - size_type* const shared_prefix = + word_size_type* const shared_data = + kokkos_impl_cuda_shared_memory<word_size_type>(); + word_size_type* const shared_prefix = shared_data + word_count.value * threadIdx.y; - size_type* const shared_accum = + word_size_type* const shared_accum = shared_data + word_count.value * (blockDim.y + 1); // Starting value for this thread block is the previous block's total. if (blockIdx.x) { - size_type* const block_total = + word_size_type* const block_total = m_scratch_space + word_count.value * (blockIdx.x - 1); for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -894,7 +929,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, typename Analysis::pointer_type(shared_data + word_count.value)); { - size_type* const block_total = + word_size_type* const block_total = shared_data + word_count.value * blockDim.y; for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -983,8 +1018,9 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, // How many block are really needed for this much work: const int grid_x = (nwork + work_per_block - 1) / work_per_block; - m_scratch_space = cuda_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * grid_x); + m_scratch_space = + reinterpret_cast<word_size_type*>(cuda_internal_scratch_space( + m_policy.space(), Analysis::value_size(m_functor) * grid_x)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type) * 1); @@ -1022,7 +1058,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, #endif DeepCopy<HostSpace, CudaSpace, Cuda>( m_policy.space(), &m_returnvalue, - m_scratch_space + (grid_x - 1) * size / sizeof(int), size); + m_scratch_space + (grid_x - 1) * size / sizeof(word_size_type), + size); } } diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 078315b65dd20d37adc6973f5ffff3a94836236b..34d4bef9fdaaf038a2fcdab257d043438a348887 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -116,6 +116,7 @@ __device__ inline void cuda_inter_warp_reduction( value = result[0]; for (int i = 1; (i * step < max_active_thread) && i < STEP_WIDTH; i++) reducer.join(&value, &result[i]); + __syncthreads(); } template <class ValueType, class ReducerType> diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 5c871e0d615fc58bb01b93566bc0ab7a0ad892b2..dca1fb9073e6de4f5889e0ae61c0f5a5787254de 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -448,11 +448,27 @@ class ParallelScanHIPBase { Policy, FunctorType>; public: + using value_type = typename Analysis::value_type; using pointer_type = typename Analysis::pointer_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; using size_type = Kokkos::Experimental::HIP::size_type; using index_type = typename Policy::index_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t<sizeof(value_type) == 2, int16_t, int8_t>, size_type>; protected: // Algorithmic constraints: @@ -463,10 +479,10 @@ class ParallelScanHIPBase { const FunctorType m_functor; const Policy m_policy; - size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - size_type m_final = false; - int m_grid_x = 0; + word_size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + size_type m_final = false; + int m_grid_x = 0; // Only let one ParallelReduce/Scan modify the shared memory. The // constructor acquires the mutex which is released in the destructor. std::lock_guard<std::mutex> m_shared_memory_lock; @@ -489,12 +505,12 @@ class ParallelScanHIPBase { __device__ inline void initial() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant<size_type, Analysis::StaticValueSize / - sizeof(size_type)> - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize / + sizeof(word_size_type)> + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); pointer_type const shared_value = reinterpret_cast<pointer_type>( - Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>() + + Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>() + word_count.value * threadIdx.y); final_reducer.init(shared_value); @@ -518,7 +534,7 @@ class ParallelScanHIPBase { // gridDim.x hip_single_inter_block_reduce_scan<true>( final_reducer, blockIdx.x, gridDim.x, - Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(), + Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(), m_scratch_space, m_scratch_flags); } @@ -527,22 +543,22 @@ class ParallelScanHIPBase { __device__ inline void final() const { typename Analysis::Reducer final_reducer(&m_functor); - const integral_nonzero_constant<size_type, Analysis::StaticValueSize / - sizeof(size_type)> - word_count(Analysis::value_size(m_functor) / sizeof(size_type)); + const integral_nonzero_constant<word_size_type, Analysis::StaticValueSize / + sizeof(word_size_type)> + word_count(Analysis::value_size(m_functor) / sizeof(word_size_type)); // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , // value[2] , ... } - size_type* const shared_data = - Kokkos::Experimental::kokkos_impl_hip_shared_memory<size_type>(); - size_type* const shared_prefix = + word_size_type* const shared_data = + Kokkos::Experimental::kokkos_impl_hip_shared_memory<word_size_type>(); + word_size_type* const shared_prefix = shared_data + word_count.value * threadIdx.y; - size_type* const shared_accum = + word_size_type* const shared_accum = shared_data + word_count.value * (blockDim.y + 1); // Starting value for this thread block is the previous block's total. if (blockIdx.x) { - size_type* const block_total = + word_size_type* const block_total = m_scratch_space + word_count.value * (blockIdx.x - 1); for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -588,7 +604,7 @@ class ParallelScanHIPBase { typename Analysis::pointer_type(shared_data + word_count.value)); { - size_type* const block_total = + word_size_type* const block_total = shared_data + word_count.value * blockDim.y; for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_accum[i] = block_total[i]; @@ -647,8 +663,9 @@ class ParallelScanHIPBase { // How many block are really needed for this much work: m_grid_x = (nwork + work_per_block - 1) / work_per_block; - m_scratch_space = Kokkos::Experimental::Impl::hip_internal_scratch_space( - m_policy.space(), Analysis::value_size(m_functor) * m_grid_x); + m_scratch_space = reinterpret_cast<word_size_type*>( + Kokkos::Experimental::Impl::hip_internal_scratch_space( + m_policy.space(), Analysis::value_size(m_functor) * m_grid_x)); m_scratch_flags = Kokkos::Experimental::Impl::hip_internal_scratch_flags( m_policy.space(), sizeof(size_type) * 1); @@ -734,7 +751,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>, DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>( Base::m_policy.space(), &m_returnvalue, - Base::m_scratch_space + (Base::m_grid_x - 1) * size / sizeof(int), + Base::m_scratch_space + (Base::m_grid_x - 1) * size / + sizeof(typename Base::word_size_type), size); } } diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp index 1091ad5ceadf6a14b2f162c49d797c6c9564d390..9002f695894e987d34fd1437bbcc1f3da7e2a04c 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ReduceScan.hpp @@ -225,11 +225,11 @@ struct HIPReductionsFunctor<FunctorType, false> { } } + template <typename SizeType> __device__ static inline bool scalar_inter_block_reduction( FunctorType const& functor, ::Kokkos::Experimental::HIP::size_type const block_count, - ::Kokkos::Experimental::HIP::size_type* const shared_data, - ::Kokkos::Experimental::HIP::size_type* const global_data, + SizeType* const shared_data, SizeType* const global_data, ::Kokkos::Experimental::HIP::size_type* const global_flags) { Scalar* const global_team_buffer_element = reinterpret_cast<Scalar*>(global_data); @@ -411,16 +411,14 @@ __device__ void hip_intra_block_reduce_scan( * Global reduce result is in the last threads' 'shared_data' location. */ -template <bool DoScan, class FunctorType> +template <bool DoScan, typename FunctorType, typename SizeType> __device__ bool hip_single_inter_block_reduce_scan_impl( FunctorType const& functor, ::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_count, - ::Kokkos::Experimental::HIP::size_type* const shared_data, - ::Kokkos::Experimental::HIP::size_type* const global_data, + SizeType* const shared_data, SizeType* const global_data, ::Kokkos::Experimental::HIP::size_type* const global_flags) { - using size_type = ::Kokkos::Experimental::HIP::size_type; - + using size_type = SizeType; using value_type = typename FunctorType::value_type; using pointer_type = typename FunctorType::pointer_type; @@ -518,13 +516,12 @@ __device__ bool hip_single_inter_block_reduce_scan_impl( return is_last_block; } -template <bool DoScan, typename FunctorType> +template <bool DoScan, typename FunctorType, typename SizeType> __device__ bool hip_single_inter_block_reduce_scan( FunctorType const& functor, ::Kokkos::Experimental::HIP::size_type const block_id, ::Kokkos::Experimental::HIP::size_type const block_count, - ::Kokkos::Experimental::HIP::size_type* const shared_data, - ::Kokkos::Experimental::HIP::size_type* const global_data, + SizeType* const shared_data, SizeType* const global_data, ::Kokkos::Experimental::HIP::size_type* const global_flags) { // If we are doing a reduction and we don't do an array reduction, we use the // reduction-only path. Otherwise, we use the common path between reduction diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index eb85ed4709ed453f40856b05b07e76fd50e06430..d0bbc18da8a1c64839444f5abb8c4d507a01a30b 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -116,6 +116,7 @@ __device__ inline void hip_inter_warp_shuffle_reduction( value = result[0]; for (int i = 1; (i * step < max_active_thread) && (i < step_width); ++i) reducer.join(&value, &result[i]); + __syncthreads(); } template <typename ValueType, typename ReducerType> diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp index 0a66ee9da71fdaf3bb2bf649fb1a7081e4651ea4..d859a5d8ae0f1908b35c4dc31aa9229cfd578bf6 100644 --- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp @@ -3711,12 +3711,13 @@ namespace Impl { template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - (std::is_same< - typename Kokkos::View<T, P...>::memory_space, - typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View<T, P...>::data_type, - typename Kokkos::View<T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + (std::is_same< + typename Kokkos::View<T, P...>::memory_space, + typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::View<T, P...>::data_type, + typename Kokkos::View<T, P...>::HostMirror::data_type>::value), typename Kokkos::View<T, P...>::HostMirror> create_mirror_view(const Kokkos::View<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>&) { @@ -3725,12 +3726,13 @@ create_mirror_view(const Kokkos::View<T, P...>& src, template <class T, class... P, class... ViewCtorArgs> inline std::enable_if_t< - !(std::is_same< - typename Kokkos::View<T, P...>::memory_space, - typename Kokkos::View<T, P...>::HostMirror::memory_space>::value && - std::is_same< - typename Kokkos::View<T, P...>::data_type, - typename Kokkos::View<T, P...>::HostMirror::data_type>::value), + !Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space && + !(std::is_same<typename Kokkos::View<T, P...>::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space>::value && + std::is_same< + typename Kokkos::View<T, P...>::data_type, + typename Kokkos::View<T, P...>::HostMirror::data_type>::value), typename Kokkos::View<T, P...>::HostMirror> create_mirror_view(const Kokkos::View<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { @@ -3738,25 +3740,33 @@ create_mirror_view(const Kokkos::View<T, P...>& src, } // Create a mirror view in a new space (specialization for same space) -template <class Space, class T, class... P, class... ViewCtorArgs> -std::enable_if_t<Impl::MirrorViewType<Space, T, P...>::is_same_memspace, - typename Impl::MirrorViewType<Space, T, P...>::view_type> -create_mirror_view(const Space&, const Kokkos::View<T, P...>& src, +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +std::enable_if_t<Impl::MirrorViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::View<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>&) { return src; } // Create a mirror view in a new space (specialization for different space) -template <class Space, class T, class... P, class... ViewCtorArgs> -std::enable_if_t<!Impl::MirrorViewType<Space, T, P...>::is_same_memspace, - typename Impl::MirrorViewType<Space, T, P...>::view_type> -create_mirror_view(const Space&, const Kokkos::View<T, P...>& src, +template <class T, class... P, class... ViewCtorArgs, + class = std::enable_if_t< + Impl::ViewCtorProp<ViewCtorArgs...>::has_memory_space>> +std::enable_if_t<!Impl::MirrorViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::is_same_memspace, + typename Impl::MirrorViewType< + typename Impl::ViewCtorProp<ViewCtorArgs...>::memory_space, + T, P...>::view_type> +create_mirror_view(const Kokkos::View<T, P...>& src, const Impl::ViewCtorProp<ViewCtorArgs...>& arg_prop) { - using MemorySpace = typename Space::memory_space; - using alloc_prop = Impl::ViewCtorProp<ViewCtorArgs..., MemorySpace>; - alloc_prop prop_copy(arg_prop); - - return Kokkos::Impl::create_mirror(src, prop_copy); + return Kokkos::Impl::create_mirror(src, arg_prop); } } // namespace Impl @@ -3815,9 +3825,10 @@ typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view( template <class Space, class T, class... P, typename Enable = std::enable_if_t<Kokkos::is_space<Space>::value>> typename Impl::MirrorViewType<Space, T, P...>::view_type create_mirror_view( - Kokkos::Impl::WithoutInitializing_t wi, Space const& space, + Kokkos::Impl::WithoutInitializing_t wi, Space const&, Kokkos::View<T, P...> const& v) { - return Impl::create_mirror_view(space, v, view_alloc(wi)); + return Impl::create_mirror_view( + v, view_alloc(typename Space::memory_space{}, wi)); } template <class T, class... P, class... ViewCtorArgs> diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp index e92ed7d2e91395aef45292b3a3b3a4f5c9cd5cf7..f8dcfc869e195b93f2ed899b89f6f64be62c46e3 100644 --- a/packages/kokkos/core/src/Kokkos_View.hpp +++ b/packages/kokkos/core/src/Kokkos_View.hpp @@ -1754,7 +1754,10 @@ struct RankDataType<ValueType, 0> { }; template <unsigned N, typename... Args> -KOKKOS_FUNCTION std::enable_if_t<N == View<Args...>::Rank, View<Args...>> +KOKKOS_FUNCTION std::enable_if_t< + N == View<Args...>::Rank && + std::is_same<typename ViewTraits<Args...>::specialize, void>::value, + View<Args...>> as_view_of_rank_n(View<Args...> v) { return v; } @@ -1762,13 +1765,13 @@ as_view_of_rank_n(View<Args...> v) { // Placeholder implementation to compile generic code for DynRankView; should // never be called template <unsigned N, typename T, typename... Args> -std::enable_if_t< - N != View<T, Args...>::Rank, +KOKKOS_FUNCTION std::enable_if_t< + N != View<T, Args...>::Rank && + std::is_same<typename ViewTraits<T, Args...>::specialize, void>::value, View<typename RankDataType<typename View<T, Args...>::value_type, N>::type, Args...>> as_view_of_rank_n(View<T, Args...>) { - Kokkos::Impl::throw_runtime_exception( - "Trying to get at a View of the wrong rank"); + Kokkos::abort("Trying to get at a View of the wrong rank"); return {}; } diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index fafd825df297123e100ccf008069f24e4a2cf1e5..129a489387a46297bdd5ce17d3ad7873cbc1c80b 100644 --- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -101,8 +101,8 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> { void push_work(const std::int32_t w) const noexcept { const std::int32_t N = m_graph.numRows(); - std::int32_t volatile* const ready_queue = &m_queue[0]; - std::int32_t volatile* const end_hint = &m_queue[2 * N + 1]; + std::int32_t* const ready_queue = &m_queue[0]; + std::int32_t* const end_hint = &m_queue[2 * N + 1]; // Push work to end of queue const std::int32_t j = atomic_fetch_add(end_hint, 1); @@ -134,14 +134,14 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> { std::int32_t pop_work() const noexcept { const std::int32_t N = m_graph.numRows(); - std::int32_t volatile* const ready_queue = &m_queue[0]; - std::int32_t volatile* const begin_hint = &m_queue[2 * N]; + std::int32_t* const ready_queue = &m_queue[0]; + std::int32_t* const begin_hint = &m_queue[2 * N]; // begin hint is guaranteed to be less than or equal to // actual begin location in the queue. - for (std::int32_t i = *begin_hint; i < N; ++i) { - const std::int32_t w = ready_queue[i]; + for (std::int32_t i = Kokkos::atomic_load(begin_hint); i < N; ++i) { + const std::int32_t w = Kokkos::atomic_load(&ready_queue[i]); if (w == END_TOKEN) { return END_TOKEN; @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> { const std::int32_t N = m_graph.numRows(); - std::int32_t volatile* const count_queue = &m_queue[N]; + std::int32_t* const count_queue = &m_queue[N]; const std::int32_t B = m_graph.row_map(w); const std::int32_t E = m_graph.row_map(w + 1); @@ -199,7 +199,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> { KOKKOS_INLINE_FUNCTION void operator()(const TagCount, int i) const noexcept { - std::int32_t volatile* const count_queue = &m_queue[m_graph.numRows()]; + std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; atomic_increment(count_queue + m_graph.entries[i]); } diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 51921765baf249a1f1dacc57221bc4f4a398c79d..a9bc085912356ec90bbef0c63688cd3f91d11a95 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -47,6 +47,7 @@ #endif #include <Kokkos_Macros.hpp> +#include <impl/Kokkos_DeviceManagement.hpp> #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) @@ -164,7 +165,11 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes); } -void OpenMPTarget::impl_initialize(InitializationSettings const&) { +void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_gpu; + const int device_num = get_gpu(settings); + omp_set_default_device(device_num); + Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); } void OpenMPTarget::impl_finalize() { diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp index c1cb6a7d91b54b951f445131e43347eaaa33a027..ecece72cf958c937702dd18b4a22642e479c857c 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -110,10 +110,9 @@ KOKKOS_IMPL_HOST_FUNCTION inline uint64_t clock_tic_host() noexcept { return ((uint64_t)a) | (((uint64_t)d) << 32); -#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) || \ - defined(__POWERPC__) || defined(__ppc__) || defined(__ppc64__) +#elif defined(__powerpc64__) || defined(__ppc64__) - unsigned int cycles = 0; + unsigned long cycles = 0; asm volatile("mftb %0" : "=r"(cycles)); diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp index f624e7a14cb21b4a395898125536ec9b55bfeaae..a5bd0032374ffc5e0e73627e33aeb3fa7c1b788e 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp @@ -166,6 +166,8 @@ int get_device_count() { #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); +#elif defined(KOKKOS_ENABLE_OPENMPTARGET) + return omp_get_num_devices(); #else Kokkos::abort("implementation bug"); return -1; @@ -426,11 +428,17 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { Kokkos::abort("implementation bug"); } - auto const* local_rank_str = - std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); // OpenMPI - if (!local_rank_str) - local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); // MVAPICH2 - if (!local_rank_str) local_rank_str = std::getenv("SLURM_LOCALID"); // SLURM + char const* local_rank_str = nullptr; + for (char const* env_var : { + "OMPI_COMM_WORLD_LOCAL_RANK", // OpenMPI + "MV2_COMM_WORLD_LOCAL_RANK", // MVAPICH2 + "MPI_LOCALRANKID", // MPICH + "SLURM_LOCALID", // SLURM + "PMI_LOCAL_RANK" // PMI + }) { + local_rank_str = std::getenv(env_var); + if (local_rank_str) break; + } // use first GPU available for execution if unable to detect local MPI rank if (!local_rank_str) { diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 738231677c600f6d928122269d848b1a2b51ac46..994dd0b2adf65d9a2440abeb5c4930b43a662d54 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -1128,9 +1128,8 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr ViewOffset( const ViewOffset<DimRHS, Kokkos::LayoutRight, void>& rhs) : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { - static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || - (DimRHS::rank == 1 && dimension_type::rank == 1 && - dimension_type::rank_dynamic == 1), + static_assert(((DimRHS::rank == 0 && dimension_type::rank == 0) || + (DimRHS::rank == 1 && dimension_type::rank == 1)), "ViewOffset LayoutLeft and LayoutRight are only compatible " "when rank <= 1"); } @@ -1778,8 +1777,7 @@ struct ViewOffset< const ViewOffset<DimRHS, Kokkos::LayoutLeft, void>& rhs) : m_dim(rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0) { static_assert((DimRHS::rank == 0 && dimension_type::rank == 0) || - (DimRHS::rank == 1 && dimension_type::rank == 1 && - dimension_type::rank_dynamic == 1), + (DimRHS::rank == 1 && dimension_type::rank == 1), "ViewOffset LayoutRight and LayoutLeft are only compatible " "when rank <= 1"); } @@ -3059,10 +3057,10 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> { std::is_trivially_copy_assignable<Dummy>::value> construct_shared_allocation() { // Shortcut for zero initialization - ValueType value{}; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX + ValueType value{}; if (Impl::is_zero_byte(value)) { uint64_t kpID = 0; if (Kokkos::Profiling::profileLibraryLoaded()) { @@ -3539,9 +3537,7 @@ class ViewMapping< typename SrcTraits::array_layout>::value || std::is_same<typename DstTraits::array_layout, Kokkos::LayoutStride>::value || - (DstTraits::dimension::rank == 0) || - (DstTraits::dimension::rank == 1 && - DstTraits::dimension::rank_dynamic == 1) + (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) }; public: diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt index 24f70c0ccb3208ca3db1acf82e08bfb56d1ef0de..16fdb39d1a36e9dd8b7d65bbe846c28b37fcf496 100644 --- a/packages/kokkos/core/unit_test/CMakeLists.txt +++ b/packages/kokkos/core/unit_test/CMakeLists.txt @@ -73,6 +73,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) SET(COMPILE_ONLY_SOURCES TestArray.cpp + TestCreateMirror.cpp TestDetectionIdiom.cpp TestInterOp.cpp TestLegionInteroperability.cpp @@ -86,6 +87,7 @@ ENDIF() KOKKOS_ADD_EXECUTABLE( TestCompileOnly SOURCES + TestCompileMain.cpp ${COMPILE_ONLY_SOURCES} ) diff --git a/packages/kokkos/core/unit_test/TestCompileMain.cpp b/packages/kokkos/core/unit_test/TestCompileMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..237c8ce181774d991a9dbdd8cacf1a5fb9f199f1 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestCompileMain.cpp @@ -0,0 +1 @@ +int main() {} diff --git a/packages/kokkos/core/unit_test/TestCreateMirror.cpp b/packages/kokkos/core/unit_test/TestCreateMirror.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e8b3b6ea105950267a862a78ebcb632120e74f66 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestCreateMirror.cpp @@ -0,0 +1,126 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> + +template <typename TestView, typename MemorySpace> +void check_memory_space(TestView, MemorySpace) { + static_assert( + std::is_same<typename TestView::memory_space, MemorySpace>::value, ""); +} + +template <class View> +auto host_mirror_test_space(View) { + return std::conditional_t< + Kokkos::SpaceAccessibility<Kokkos::HostSpace, + typename View::memory_space>::accessible, + typename View::memory_space, Kokkos::HostSpace>{}; +} + +template <typename View> +void test_create_mirror_properties(const View& view) { + using namespace Kokkos; + using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space; + + // clang-format off + + // create_mirror + check_memory_space(create_mirror(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror( view), host_mirror_test_space(view)); + check_memory_space(create_mirror(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view + check_memory_space(create_mirror_view(WithoutInitializing, view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view( view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(WithoutInitializing, DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view( DefaultExecutionSpace{}, view), DeviceMemorySpace{}); + + // create_mirror view_alloc + check_memory_space(create_mirror(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc( DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror view_alloc + execution space + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view view_alloc + execution space + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(DefaultHostExecutionSpace{}), view), host_mirror_test_space(view)); + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, WithoutInitializing, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + check_memory_space(create_mirror_view(view_alloc(DefaultExecutionSpace{}, DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy + check_memory_space(create_mirror_view_and_copy(HostSpace{}, view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(DeviceMemorySpace{}, view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}), view), DeviceMemorySpace{}); + + // create_mirror_view_and_copy view_alloc + execution space + check_memory_space(create_mirror_view_and_copy(view_alloc(HostSpace{}, DefaultHostExecutionSpace{}), view), HostSpace{}); + check_memory_space(create_mirror_view_and_copy(view_alloc(DeviceMemorySpace{}, DefaultExecutionSpace{}), view), DeviceMemorySpace{}); + + // clang-format on +} + +void test() { + Kokkos::View<int*, Kokkos::DefaultExecutionSpace> device_view("device view", + 10); + Kokkos::View<int*, Kokkos::HostSpace> host_view("host view", 10); + + test_create_mirror_properties(device_view); + test_create_mirror_properties(host_view); +} diff --git a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp index f87fda615643c5a75ef5ee4da7349bab0eea40cd..23da339cae03246084c0e67e8781d08972fb864e 100644 --- a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp +++ b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp @@ -92,5 +92,3 @@ static_assert(std::is_same<difference_type<Woof>, int>::value, static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value, "Bark's difference_type should be ptrdiff_t!"); } // namespace Example - -int main() {} diff --git a/packages/kokkos/core/unit_test/TestScan.hpp b/packages/kokkos/core/unit_test/TestScan.hpp index 1a4056af07d3f9584b105cd536e5abca051b30c2..356ffde9565aaf40035e033748056cbb3028f678 100644 --- a/packages/kokkos/core/unit_test/TestScan.hpp +++ b/packages/kokkos/core/unit_test/TestScan.hpp @@ -45,20 +45,23 @@ #include <Kokkos_Core.hpp> #include <cstdio> -namespace Test { +namespace { -template <class Device> +template <class Device, class T, T ImbalanceSz> struct TestScan { using execution_space = Device; - using value_type = int64_t; + using value_type = T; Kokkos::View<int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors; KOKKOS_INLINE_FUNCTION void operator()(const int iwork, value_type& update, const bool final_pass) const { - const value_type n = iwork + 1; - const value_type imbalance = ((1000 <= n) && (0 == n % 1000)) ? 1000 : 0; + const value_type n = iwork + 1; + const value_type imbalance = + ((ImbalanceSz <= n) && (value_type(0) == n % ImbalanceSz)) + ? ImbalanceSz + : value_type(0); // Insert an artificial load imbalance @@ -133,12 +136,29 @@ struct TestScan { } } }; +} // namespace TEST(TEST_CATEGORY, scan) { - TestScan<TEST_EXECSPACE>::test_range(1, 1000); - TestScan<TEST_EXECSPACE>(0); - TestScan<TEST_EXECSPACE>(100000); - TestScan<TEST_EXECSPACE>(10000000); - TEST_EXECSPACE().fence(); + constexpr auto imbalance_size = 1000; + TestScan<TEST_EXECSPACE, int64_t, imbalance_size>::test_range(1, 1000); + TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(0); + TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(100000); + TestScan<TEST_EXECSPACE, int64_t, imbalance_size>(10000000); +} + +TEST(TEST_CATEGORY, small_size_scan) { + constexpr auto imbalance_size = 10; // Pick to not overflow... + TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(0); + TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(5); + TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>(10); + TestScan<TEST_EXECSPACE, std::int8_t, imbalance_size>( + static_cast<std::size_t>( + std::sqrt(std::numeric_limits<std::int8_t>::max()))); + constexpr auto short_imbalance_size = 100; // Pick to not overflow... + TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(0); + TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(5); + TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>(100); + TestScan<TEST_EXECSPACE, std::int16_t, short_imbalance_size>( + static_cast<std::size_t>( + std::sqrt(std::numeric_limits<std::int16_t>::max()))); } -} // namespace Test diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp index f1d0f9cb3b8a37f35f9b4962e2f183f26701072c..3f05b2ef66a04783a94f854259253cb984411819 100644 --- a/packages/kokkos/core/unit_test/TestTeam.hpp +++ b/packages/kokkos/core/unit_test/TestTeam.hpp @@ -1616,6 +1616,73 @@ struct TestTeamPolicyHandleByValue { } // namespace +namespace { +template <typename ExecutionSpace> +struct TestRepeatedTeamReduce { + static constexpr int ncol = 1500; // nothing special, just some work + + KOKKOS_FUNCTION void operator()( + const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team) + const { + // non-divisible by power of two to make triggering problems easier + constexpr int nlev = 129; + constexpr auto pi = Kokkos::Experimental::pi_v<double>; + double b = 0.; + for (int ri = 0; ri < 10; ++ri) { + // The contributions here must be sufficiently complex, simply adding ones + // wasn't enough to trigger the bug. + const auto g1 = [&](const int k, double &acc) { + acc += Kokkos::cos(pi * double(k) / nlev); + }; + const auto g2 = [&](const int k, double &acc) { + acc += Kokkos::sin(pi * double(k) / nlev); + }; + double a1, a2; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2); + b += a1; + b += a2; + } + const auto h = [&]() { + const auto col = team.league_rank(); + v(col) = b + col; + }; + Kokkos::single(Kokkos::PerTeam(team), h); + } + + KOKKOS_FUNCTION void operator()(const int i, int &bad) const { + if (v(i) != v(0) + i) { + ++bad; + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Failing at %d!\n", i); + } + } + + TestRepeatedTeamReduce() : v("v", ncol) { test(); } + + void test() { + int team_size_recommended = + Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended( + *this, Kokkos::ParallelForTag()); + // Choose a non-recommened (non-power of two for GPUs) team size + int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1; + + // The failure was non-deterministic so run the test a bunch of times + for (int it = 0; it < 100; ++it) { + Kokkos::parallel_for( + Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this); + + int bad = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol), + *this, bad); + ASSERT_EQ(bad, 0) << " Failing in iteration " << it; + } + } + + Kokkos::View<double *, ExecutionSpace> v; +}; + +} // namespace + } // namespace Test /*--------------------------------------------------------------------------*/ diff --git a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp index 469bba23b73ee9bd316f7c2fbcd9389144f03e12..4d4f3b1f4d34eeae219b9436922162f7279ac8ca 100644 --- a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp +++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp @@ -134,5 +134,15 @@ TEST(TEST_CATEGORY, team_parallel_dummy_with_reducer_and_scratch_space) { } } +TEST(TEST_CATEGORY, repeated_team_reduce) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>::value) + GTEST_SKIP() << "skipping since team_reduce for OpenMPTarget is not " + "properly implemented"; +#endif + + TestRepeatedTeamReduce<TEST_EXECSPACE>(); +} + } // namespace Test #endif diff --git a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp index 03c3b977edeab7ec5b51c406da65b0e089f5a0de..3ac392d3e98860fe536a07fbff43cc7d5fc1aecd 100644 --- a/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp +++ b/packages/kokkos/core/unit_test/TestViewIsAssignable.hpp @@ -92,8 +92,18 @@ TEST(TEST_CATEGORY, view_is_assignable) { View<double*, left, d_exec>>::test(false, false, 10); // Layout assignment + Impl::TestAssignability<View<int, left, d_exec>, + View<int, right, d_exec>>::test(true, true); Impl::TestAssignability<View<int*, left, d_exec>, View<int*, right, d_exec>>::test(true, true, 10); + Impl::TestAssignability<View<int[5], left, d_exec>, + View<int*, right, d_exec>>::test(false, false, 10); + Impl::TestAssignability<View<int[10], left, d_exec>, + View<int*, right, d_exec>>::test(false, true, 10); + Impl::TestAssignability<View<int*, left, d_exec>, + View<int[5], right, d_exec>>::test(true, true); + Impl::TestAssignability<View<int[5], left, d_exec>, + View<int[10], right, d_exec>>::test(false, false); // This could be made possible (due to the degenerate nature of the views) but // we do not allow this yet diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt index a1a87ce3199d10449b92be4a8e09ecaa790a303f..bd639c847e03cdd0909fc83ccf6d0843148d6bea 100644 --- a/packages/kokkos/master_history.txt +++ b/packages/kokkos/master_history.txt @@ -29,3 +29,4 @@ tag: 3.5.00 date: 11:19:2021 master: c28a8b03 release: 21b879e4 tag: 3.6.00 date: 04:14:2022 master: 2834f94a release: 6ea708ff tag: 3.6.01 date: 06:16:2022 master: b52f8c83 release: afe9b404 tag: 3.7.00 date: 08:25:2022 master: d19aab99 release: 0018e5fb +tag: 3.7.01 date: 12:01:2022 master: 61d7db55 release: d3bb8cfe diff --git a/packages/kokkos/simd/cmake/Dependencies.cmake b/packages/kokkos/simd/cmake/Dependencies.cmake index 5e29157369c9ab8cab935a1bfc4c6dad2fdd0296..1d71d8af341181f689a6a8bf63036b67584cb138 100644 --- a/packages/kokkos/simd/cmake/Dependencies.cmake +++ b/packages/kokkos/simd/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC HPX + LIB_OPTIONAL_TPLS Pthread CUDA HWLOC TEST_OPTIONAL_TPLS CUSPARSE ) diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp index 2166fa3cb78e70af887ff7f74e2cac9f141bf1de..1815adb4a7621c8b4b3d93ac626c417f1c42644b 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp @@ -76,7 +76,7 @@ namespace Impl { /// instances in other translation units, we must update this CUDA global /// variable based on the Host global variable prior to running any kernels /// that will use it. -/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. +/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. __device__ #ifdef __CUDACC_RDC__ __constant__ extern @@ -138,42 +138,33 @@ namespace { static int lock_array_copied = 0; inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace - -#ifdef __CUDACC_RDC__ -inline -#else -static -#endif - void - copy_cuda_lock_arrays_to_device() { - if (lock_array_copied == 0) { - cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE, - &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, - sizeof(int32_t*)); - cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE, - &CUDA_SPACE_ATOMIC_LOCKS_NODE_h, - sizeof(int32_t*)); - } - lock_array_copied = 1; -} - } // namespace Impl } // namespace desul +/* It is critical that this code be a macro, so that it will + capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE + putting this in an inline function will NOT do the right thing! */ +#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + { \ + if (::desul::Impl::lock_array_copied == 0) { \ + cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \ + &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \ + sizeof(int32_t*)); \ + cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \ + &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \ + sizeof(int32_t*)); \ + } \ + ::desul::Impl::lock_array_copied = 1; \ + } #endif /* defined( __CUDACC__ ) */ #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */ -namespace desul { - #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__)) -inline void ensure_cuda_lock_arrays_on_device() {} +#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else -static inline void ensure_cuda_lock_arrays_on_device() { - Impl::copy_cuda_lock_arrays_to_device(); -} +#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() #endif -} // namespace desul - -#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */ +#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */ diff --git a/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp index 19944b378e2c47090dbe3ce28913017a3f308933..cb8482c5da8b83bb1fc6323dea09fffce86d115b 100644 --- a/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp +++ b/packages/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp @@ -70,7 +70,7 @@ void init_lock_arrays_cuda() { "init_lock_arrays_cuda: cudaMalloc host locks"); auto error_sync1 = cudaDeviceSynchronize(); - copy_cuda_lock_arrays_to_device(); + DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs"); init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); auto error_sync2 = cudaDeviceSynchronize(); @@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() { CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; #ifdef __CUDACC_RDC__ - copy_cuda_lock_arrays_to_device(); + DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); #endif }