diff --git a/packages/kokkos/.clang-format-ignore b/packages/kokkos/.clang-format-ignore
index b163a2bfeaf4e865925fb96f46260fd3ec2cfcb9..43d242c3106a29c063e0258bbd4e4553f66f883c 100644
--- a/packages/kokkos/.clang-format-ignore
+++ b/packages/kokkos/.clang-format-ignore
@@ -1,2 +1,3 @@
 core/unit_test/config/results/*
 tpls/gtest/gtest/*
+core/src/desul/*
diff --git a/packages/kokkos/.clang-tidy b/packages/kokkos/.clang-tidy
index 207a105c5bdf60b807db528d612aac89e6bb88b6..2b0d6e51d438948c2d5ef85e100c97ca16184e9b 100644
--- a/packages/kokkos/.clang-tidy
+++ b/packages/kokkos/.clang-tidy
@@ -1,3 +1,3 @@
-Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr'
+Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast'
 FormatStyle: file
 HeaderFilterRegex: '.*/*.hpp'
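For reference, the newly enabled `cppcoreguidelines-pro-type-cstyle-cast` check diagnoses C-style casts and suggests the named C++ casts instead. A minimal illustration of what it flags (hypothetical snippet, not code from this repository):

```cpp
#include <cstdint>

double ratio(std::int64_t num, std::int64_t den) {
  // double n = (double)num;           // flagged: C-style cast
  double n = static_cast<double>(num); // accepted: named cast
  double d = static_cast<double>(den);
  return n / d;
}
```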
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
index a9dc0ec86cd77c49cb6958d01608ca47e5e0dab9..b76167f330a87eaf79af25f706c33d3e910865d1 100644
--- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
@@ -40,29 +40,29 @@ jobs:
           path: ~/.ccache
          key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }}
           restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{github.ref}}
-      - name: Get trial license
-        if: ${{ matrix.cxx == 'icpc' }}
-        run: |
-          mkdir ~/Licenses
-          curl https://dynamicinstaller.intel.com/api/v2/license > ~/Licenses/intel.lic
       - name: maybe_disable_death_tests
         if: ${{ matrix.distro == 'fedora:rawhide' }}
         run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV
-      - name: build-and-test
+      - name: CMake
         run: |
-          ccache -z
-          cmake \
+          cmake -B builddir \
             -DCMAKE_INSTALL_PREFIX=/usr \
             -DKokkos_ENABLE_HWLOC=ON \
             -DKokkos_ENABLE_OPENMP=${{ matrix.openmp }} \
             -DKokkos_ENABLE_TESTS=ON \
             -DKokkos_ENABLE_EXAMPLES=ON \
+            -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+            -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
             -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
-            -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \
-            -DBUILD_NAME=${{ matrix.distro }}-${{ matrix.cxx }} \
-            -DBUILD_JOBS=2 -DBINARY_DIR=builddir -DSITE=GitHub-Linux \
-            -P cmake/KokkosCI.cmake
+            -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
+      - name: Build
+        run: |
+          ccache -z
+          cmake --build builddir --parallel 2
           ccache -s
+      - name: Tests
+        working-directory: builddir
+        run: ctest --output-on-failure
       - name: Test DESTDIR Install
         run: DESTDIR=${PWD}/install cmake --build builddir --target install && rm -rf ${PWD}/install/usr && rmdir ${PWD}/install
       - name: Install
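Several jobs above now configure with `-DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF` (or turn the former off entirely). A sketch of how such a pair of options typically gates legacy APIs in the sources, illustrative only and assuming the preprocessor macro names that these build options define:

```cpp
// Hypothetical header: the first macro keeps a legacy entry point compiled in,
// the second additionally marks it [[deprecated]] so callers see warnings.
#define KOKKOS_ENABLE_DEPRECATED_CODE_3  // stands in for -DKokkos_ENABLE_DEPRECATED_CODE_3=ON

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
[[deprecated("use the replacement API")]]
#endif
inline int legacy_api() { return 42; }
#endif

int main() { return legacy_api() == 42 ? 0 : 1; }
```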
diff --git a/packages/kokkos/.github/workflows/osx.yml b/packages/kokkos/.github/workflows/osx.yml
index 855b557c829a609f34b82c7e5f307eef60cf0ede..178af12405cc2f6fc24a5ae46adf034b1c73a94e 100644
--- a/packages/kokkos/.github/workflows/osx.yml
+++ b/packages/kokkos/.github/workflows/osx.yml
@@ -21,15 +21,19 @@ jobs:
 
     steps:
       - uses: actions/checkout@v2
-      - name: build-and-test
+      - name: configure
         run:
-          cmake
+          cmake -B build .
             -DKokkos_ENABLE_${{ matrix.backend }}=On
             -DCMAKE_CXX_FLAGS="-Werror"
             -DCMAKE_CXX_STANDARD=14
             -DKokkos_ENABLE_COMPILER_WARNINGS=ON
+            -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF
             -DKokkos_ENABLE_TESTS=On
             -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
-            -DBUILD_NAME=macOS-${{ matrix.backend }}
-            -DTARGET=install -DBUILD_JOBS=2 -DSITE=GitHub-OSX
-            -P cmake/KokkosCI.cmake
+      - name: build
+        run:
+          cmake --build build --parallel 2
+      - name: test
+        working-directory: build
+        run: ctest --output-on-failure
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
index 85e71521db3fbaa780bb77fb42f5c3e74ae26800..bfbe5e6fd3ec3ae381fe5adbd8b39d0797bff2fa 100644
--- a/packages/kokkos/.gitrepo
+++ b/packages/kokkos/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:kokkos/kokkos.git
 	branch = master
-	commit = c28a8b03288b185f846ddfb1b7c08213e12e2634
-	parent = 2c8a5742df289f02f5ca31bce1e293dbfdb1701e
+	commit = 2879e23507bcb21adb739d6317b3430f665de4a6
+	parent = 36833c0c0fc1a841eaed63df6b7d34609307f2a5
 	method = merge
 	cmdver = 0.4.3
diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins
index 001171d648e7cfb2236d17439720562707faaab4..09e8515e96f2bb8255bcaba7304780b484f303ea 100644
--- a/packages/kokkos/.jenkins
+++ b/packages/kokkos/.jenkins
@@ -5,9 +5,12 @@ pipeline {
         CCACHE_DIR = '/tmp/ccache'
         CCACHE_MAXSIZE = '10G'
         CCACHE_CPP2 = 'true'
-        BUILD_JOBS = 8
-        SITE = 'Jenkins'
     }
+
+    options {
+        timeout(time: 6, unit: 'HOURS')
+    }
+
     stages {
         stage('Clang-Format') {
             agent {
@@ -36,7 +39,7 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -44,13 +47,15 @@ pipeline {
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unknown-cuda-version -Wno-gnu-zero-variadic-macro-arguments" \
                                 -DKokkos_ARCH_VOLTA70=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_EXAMPLES=ON \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_SYCL=ON \
                                 -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
                                 -DCMAKE_CXX_STANDARD=17 \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -58,12 +63,12 @@ pipeline {
                         }
                     }
                 }
-                stage('HIP-ROCm-3.8-C++14') {
+                stage('HIP-ROCm-4.2-C++14') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.hipcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.8'
+                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2'
                             label 'rocm-docker && vega'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
                         }
@@ -72,24 +77,23 @@ pipeline {
                         OMP_NUM_THREADS = 8
                         OMP_PLACES = 'threads'
                         OMP_PROC_BIND = 'spread'
-                        LC_ALL = 'C'
                     }
                     steps {
                         sh 'ccache --zero-stats'
                         sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Debug \
                                 -DCMAKE_CXX_COMPILER=hipcc \
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \
                                 -DCMAKE_CXX_STANDARD=14 \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_HIP=ON \
-                                -DKokkos_ARCH_VEGA906=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -97,33 +101,73 @@ pipeline {
                         }
                     }
                 }
-                stage('HIP-ROCm-3.8-C++17') {
+                stage('HIP-ROCm-4.2-C++17') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.hipcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.8'
+                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2'
                             label 'rocm-docker && vega'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
                         }
                     }
-                    environment {
-                        LC_ALL = 'C'
-                    }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=RelWithDebInfo \
                                 -DCMAKE_CXX_COMPILER=hipcc \
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
                                 -DCMAKE_CXX_STANDARD=17 \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_HIP=ON \
+                              .. && \
+                              make -j8 && ctest --verbose'''
+                    }
+                    post {
+                        always {
+                            sh 'ccache --show-stats'
+                        }
+                    }
+                }
+                stage('OPENMPTARGET-ROCm-4.2') {
+                    agent {
+                        dockerfile {
+                            filename 'Dockerfile.hipcc'
+                            dir 'scripts/docker'
+                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2'
+                            label 'rocm-docker && vega && AMD_Radeon_Instinct_MI60'
+                            args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
+                        }
+                    }
+                    environment {
+                        OMP_NUM_THREADS = 8
+                        OMP_PLACES = 'threads'
+                        OMP_PROC_BIND = 'spread'
+                        LC_ALL = 'C'
+                    }
+                    steps {
+                        sh 'ccache --zero-stats'
+                        sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig'
+                        sh '''rm -rf build && \
+                              cmake \
+                                -Bbuild \
+                                -DCMAKE_BUILD_TYPE=Debug \
+                                -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
+                                -DCMAKE_CXX_STANDARD=17 \
+                                -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
+                                -DKokkos_ENABLE_TESTS=ON \
+                                -DKokkos_ENABLE_OPENMPTARGET=ON \
+                                -DKokkos_ENABLE_OPENMP=ON \
                                 -DKokkos_ARCH_VEGA906=ON \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              && \
+                              cmake --build build --parallel 8 && \
+                              cd build && ctest --output-on-failure
+                        '''
                     }
                     post {
                         always {
@@ -142,19 +186,21 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=RelWithDebInfo \
                                 -DCMAKE_CXX_COMPILER=clang++ \
                                 -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror -Wno-undefined-internal -Wno-pass-failed" \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_TUNING=ON \
                                 -DKokkos_ENABLE_OPENMPTARGET=ON \
                                 -DKokkos_ARCH_VOLTA70=ON \
                                 -DCMAKE_CXX_STANDARD=17 \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -173,7 +219,7 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \
@@ -182,13 +228,15 @@ pipeline {
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DCMAKE_CXX_STANDARD=14 \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_CUDA=ON \
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_TUNING=ON \
                                 -DKokkos_ARCH_VOLTA70=ON \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -244,7 +292,7 @@ pipeline {
                     steps {
                         sh 'ccache --zero-stats'
                         sh '''rm -rf install && mkdir -p install && \
-                              rm -rf build && \
+                              rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER=g++-8 \
@@ -256,10 +304,12 @@ pipeline {
                                 -DKokkos_ENABLE_CUDA_LAMBDA=OFF \
                                 -DKokkos_ENABLE_CUDA_UVM=ON \
                                 -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \
-                                -DCMAKE_INSTALL_PREFIX=${PWD}/install \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                                -DTARGET=install \
-                              -P cmake/KokkosCI.cmake && \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
+                                -DCMAKE_INSTALL_PREFIX=${PWD}/../install \
+                              .. && \
+                              make -j8 install && \
+                              cd .. && \
                               rm -rf build-tests && mkdir -p build-tests && cd build-tests && \
                               export CMAKE_PREFIX_PATH=${PWD}/../install && \
                               cmake \
@@ -302,7 +352,7 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Debug \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -312,14 +362,14 @@ pipeline {
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEBUG=ON \
                                 -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_CUDA=ON \
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                                -DTARGET=install \
-                              -P cmake/KokkosCI.cmake && \
-                              cd example/build_cmake_in_tree && \
+                              .. && \
+                              make -j8 && ctest --verbose && \
+                              cd ../example/build_cmake_in_tree && \
                               rm -rf build && mkdir -p build && cd build && \
                               cmake -DCMAKE_CXX_STANDARD=14 .. && make -j8 && ctest --verbose'''
                     }
@@ -342,18 +392,21 @@ pipeline {
                         OMP_PROC_BIND = 'true'
                     }
                     steps {
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_STANDARD=14 \
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake && \
-                              gcc -I$PWD/core/src core/unit_test/tools/TestCInterface.c'''
+                                -DKokkos_ENABLE_LIBQUADMATH=ON \
+                                -DCMAKE_PREFIX_PATH=/usr/local/lib/gcc/x86_64-unknown-linux-gnu/5.3.0 \
+                              .. && \
+                              make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c'''
                     }
                 }
             }
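The last stage above newly enables `-DKokkos_ENABLE_LIBQUADMATH=ON` (with a `CMAKE_PREFIX_PATH` pointing at the GCC installation that ships the library), exercising the quad-precision math support added in this release. For orientation, plain GCC libquadmath usage looks like this (standalone sketch of the underlying library, not the Kokkos API):

```cpp
// Build with: g++ quad.cpp -lquadmath
#include <quadmath.h>
#include <cstdio>

int main() {
  __float128 x = 2.0Q;      // quad-precision literal (GCC extension)
  __float128 r = sqrtq(x);  // libquadmath square root
  char buf[128];
  quadmath_snprintf(buf, sizeof(buf), "%.33Qg", r);
  std::printf("sqrt(2) ~= %s\n", buf);
  return 0;
}
```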
diff --git a/packages/kokkos/.travis.yml b/packages/kokkos/.travis.yml
index 04ef01c1602cf87aae3e39225037d65f49651f62..87d0fd5cf6ed8b9ebc158d1eb8bbe91d56101963 100644
--- a/packages/kokkos/.travis.yml
+++ b/packages/kokkos/.travis.yml
@@ -67,14 +67,13 @@ install:
 
 before_script:
   - ccache -z
-  - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; export BUILD_NAME_SUFFIX="-Coverage"; fi
+  - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; fi
   - if [[ ! ${CMAKE_BUILD_TYPE} ]]; then export CXXFLAGS="${CXXFLAGS} -O2"; fi
 
 script:
   - export OMP_NUM_THREADS=2
   - export OMP_PLACES=threads
   - export OMP_PROC_BIND=spread
-  - export BUILD_JOBS=2
   # LD_LIBRARY_PATH workaround to find clang's libomp: https://github.com/travis-ci/travis-ci/issues/8613
   - if [[ ${CC} = clang ]]; then export LD_LIBRARY_PATH=/usr/local/clang/lib${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH; fi
   # enable ccache for clang on linux and add CCACHE_CPP2 to avoid 'Argument unused during compilation -I...' warning
@@ -82,17 +81,17 @@ script:
       ln -s /usr/bin/ccache $HOME/bin/clang++;
       export CCACHE_CPP2=yes;
     fi
-  - cmake
+  - mkdir build &&
+    pushd build &&
+    cmake ..
           ${BACKEND:+-DKokkos_ENABLE_${BACKEND}=On}
           -DCMAKE_CXX_FLAGS="${CXXFLAGS} -Werror"
           -DCMAKE_CXX_STANDARD=14
           -DKokkos_ENABLE_COMPILER_WARNINGS=ON
           -DKokkos_ENABLE_TESTS=On
-          ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}}
-          -DBUILD_NAME="${CC}-${BACKEND}${BUILD_NAME_SUFFIX}"
-          -DSITE=Travis
-          -P cmake/KokkosCI.cmake &&
-    pushd build &&
+          ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}} &&
+    make VERBOSE=1 -j2 &&
+    travis_wait 60 make test CTEST_OUTPUT_ON_FAILURE=1 &&
     make install DESTDIR=${PWD}/install && rm -rf ${PWD}/install/usr/local && rmdir ${PWD}/install/usr &&
     popd
 
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index 7bb6de4cd924051c621bec0ac2cca5f734960e9e..2e779791dde2a83394662f640f90626f66696f28 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,5 +1,165 @@
 # Change Log
 
+## [3.5.00](https://github.com/kokkos/kokkos/tree/3.5.00) (2021-10-19)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.01...3.5.00)
+
+### Features:
+
+- Add support for quad-precision math functions/traits [\#4098](https://github.com/kokkos/kokkos/pull/4098)
+- Adding ExecutionSpace partitioning function [\#4096](https://github.com/kokkos/kokkos/pull/4096)
+- Improve Python Interop Capabilities [\#4065](https://github.com/kokkos/kokkos/pull/4065)
+- Add half_t Kokkos::rand specialization [\#3922](https://github.com/kokkos/kokkos/pull/3922)
+- Add math special functions: erf, erfcx, expint1, Bessel functions, Hankel functions [\#3920](https://github.com/kokkos/kokkos/pull/3920)
+- Add missing common mathematical functions [\#4043](https://github.com/kokkos/kokkos/pull/4043) [\#4036](https://github.com/kokkos/kokkos/pull/4036) [\#4034](https://github.com/kokkos/kokkos/pull/4034)
+- Let the numeric traits be SFINAE-friendly [\#4038](https://github.com/kokkos/kokkos/pull/4038)
+- Add Desul atomics - enabling memory-order and memory-scope parameters [\#3247](https://github.com/kokkos/kokkos/pull/3247)
+- Add detection idiom from the C++ standard library extension version 2 [\#3980](https://github.com/kokkos/kokkos/pull/3980)
+- Fence Profiling Support in all backends [\#3966](https://github.com/kokkos/kokkos/pull/3966) [\#4304](https://github.com/kokkos/kokkos/pull/4304) [\#4258](https://github.com/kokkos/kokkos/pull/4258) [\#4232](https://github.com/kokkos/kokkos/pull/4232)
+- Significant SYCL enhancements (see below)
+
+### Deprecations:
+
+- Deprecate CUDA_SAFE_CALL and HIP_SAFE_CALL [\#4249](https://github.com/kokkos/kokkos/pull/4249)
+- Deprecate Kokkos::Impl::Timer (Kokkos::Timer has been available for a long time) [\#4201](https://github.com/kokkos/kokkos/pull/4201)
+- Deprecate Experimental::MasterLock [\#4094](https://github.com/kokkos/kokkos/pull/4094)
+- Deprecate Kokkos_TaskPolicy.hpp (headers got reorganized, doesn't remove functionality) [\#4011](https://github.com/kokkos/kokkos/pull/4011)
+- Deprecate backward compatibility features [\#3978](https://github.com/kokkos/kokkos/pull/3978)
+- Update and deprecate is_space::host_memory/execution/mirror_space [\#3973](https://github.com/kokkos/kokkos/pull/3973)
+
+
+### Backends and Archs Enhancements:
+
+- Enabling constbitset constructors in kernels [\#4296](https://github.com/kokkos/kokkos/pull/4296)
+- Use ZeroMemset in View constructor to improve performance [\#4226](https://github.com/kokkos/kokkos/pull/4226)
+- Use memset in deep_copy [\#3944](https://github.com/kokkos/kokkos/pull/3944)
+- Add missing fence() calls in resize(View) that effectively do deep_copy(resized, orig) [\#4212](https://github.com/kokkos/kokkos/pull/4212)
+- Avoid allocations in resize and realloc [\#4207](https://github.com/kokkos/kokkos/pull/4207)
+- StaticCsrGraph: use device type instead of execution space to construct views [\#3991](https://github.com/kokkos/kokkos/pull/3991)
+- Consider std::sort when view is accessible from host [\#3929](https://github.com/kokkos/kokkos/pull/3929)
+- Fix CPP20 warnings except for volatile [\#4312](https://github.com/kokkos/kokkos/pull/4312)
+
+#### SYCL:
+- Introduce SYCLHostUSMSpace [\#4268](https://github.com/kokkos/kokkos/pull/4268)
+- Implement SYCL TeamPolicy for vector_size > 1 [\#4183](https://github.com/kokkos/kokkos/pull/4183)
+- Enable 64bit ranges for SYCL [\#4211](https://github.com/kokkos/kokkos/pull/4211)
+- Don't print SYCL device info in execution space initialization [\#4168](https://github.com/kokkos/kokkos/pull/4168)
+- Improve SYCL MDRangePolicy performance [\#4161](https://github.com/kokkos/kokkos/pull/4161)
+- Use sub_groups in SYCL parallel_scan [\#4147](https://github.com/kokkos/kokkos/pull/4147)
+- Implement subgroup reduction for SYCL RangePolicy parallel_reduce [\#3940](https://github.com/kokkos/kokkos/pull/3940)
+- Use DPC++ broadcast extension in SYCL team_broadcast [\#4103](https://github.com/kokkos/kokkos/pull/4103)
+- Only fence in SYCL parallel_reduce for non-device-accessible result_ptr [\#4089](https://github.com/kokkos/kokkos/pull/4089)
+- Improve fencing behavior in SYCL backend [\#4088](https://github.com/kokkos/kokkos/pull/4088)
+- Fence all registered SYCL queues before deallocating memory [\#4086](https://github.com/kokkos/kokkos/pull/4086)
+- Implement SYCL::print_configuration [\#3992](https://github.com/kokkos/kokkos/pull/3992)
+- Reuse scratch memory in parallel_scan and TeamPolicy (decreases memory footprint) [\#3899](https://github.com/kokkos/kokkos/pull/3899) [\#3889](https://github.com/kokkos/kokkos/pull/3889)
+
+#### CUDA:
+- Cuda improve heuristic for blocksize [\#4271](https://github.com/kokkos/kokkos/pull/4271)
+- Don't use [[deprecated]] for nvcc [\#4229](https://github.com/kokkos/kokkos/pull/4229)
+- Improve error message for NVHPC as host compiler [\#4227](https://github.com/kokkos/kokkos/pull/4227)
+- Update support for cuda reductions to work with types < 4 bytes [\#4156](https://github.com/kokkos/kokkos/pull/4156)
+- Fix incompatible team size deduction in rare cases parallel_reduce [\#4142](https://github.com/kokkos/kokkos/pull/4142)
+- Remove UVM usage in DynamicView [\#4129](https://github.com/kokkos/kokkos/pull/4129)
+- Remove dependency between core and containers [\#4114](https://github.com/kokkos/kokkos/pull/4114)
+- Adding opt-in CudaMallocSync support when using CUDA version >= 11.2 [\#4026](https://github.com/kokkos/kokkos/pull/4026) [\#4233](https://github.com/kokkos/kokkos/pull/4233)
+- Fix a potential race condition in the CUDA backend [\#3999](https://github.com/kokkos/kokkos/pull/3999)
+
+#### HIP:
+- Implement new blocksize deduction method for HIP Backend [\#3953](https://github.com/kokkos/kokkos/pull/3953)
+- Add multiple LaunchMechanism [\#3820](https://github.com/kokkos/kokkos/pull/3820)
+- Make HIP backend thread-safe [\#4170](https://github.com/kokkos/kokkos/pull/4170)
+
+#### Serial:
+- Refactor Serial backend and fix thread-safety issue [\#4053](https://github.com/kokkos/kokkos/pull/4053)
+
+#### OpenMPTarget:
+- OpenMPTarget: support array reductions in RangePolicy [\#4040](https://github.com/kokkos/kokkos/pull/4040)
+- OpenMPTarget: add MDRange parallel_reduce [\#4032](https://github.com/kokkos/kokkos/pull/4032)
+- OpenMPTarget: Fix bug in the case of a reducer [\#4044](https://github.com/kokkos/kokkos/pull/4044)
+- OpenMPTarget: verify process fix [\#4041](https://github.com/kokkos/kokkos/pull/4041)
+
+### BuildSystem Enhancements:
+
+#### Important BuildSystem Updates:
+- Use hipcc architecture autodetection when Kokkos_ARCH is not set [\#3941](https://github.com/kokkos/kokkos/pull/3941)
+- Introduce Kokkos_ENABLE_DEPRECATION_WARNINGS and remove deprecated code with Kokkos_ENABLE_DEPRECATED_CODE_3 [\#4106](https://github.com/kokkos/kokkos/pull/4106) [\#3855](https://github.com/kokkos/kokkos/pull/3855)
+
+#### Other Improvements:
+- Add allow-unsupported-compiler flag to nvcc-wrapper [\#4298](https://github.com/kokkos/kokkos/pull/4298)
+- nvcc_wrapper: fix errors in argument handling [\#3993](https://github.com/kokkos/kokkos/pull/3993)
+- Adds support for -time=<file> and -time <file> in nvcc_wrapper [\#4015](https://github.com/kokkos/kokkos/pull/4015)
+- nvcc_wrapper: suppress duplicates of GPU architecture and RDC flags [\#3968](https://github.com/kokkos/kokkos/pull/3968)
+- Fix TMPDIR support in nvcc_wrapper [\#3792](https://github.com/kokkos/kokkos/pull/3792)
+- NVHPC: update PGI compiler arch flags [\#4133](https://github.com/kokkos/kokkos/pull/4133)
+- Replace PGI with NVHPC (works for both) [\#4196](https://github.com/kokkos/kokkos/pull/4196)
+- Make sure that KOKKOS_CXX_HOST_COMPILER_ID is defined [\#4235](https://github.com/kokkos/kokkos/pull/4235)
+- Add options to Makefile builds for deprecated code and warnings [\#4215](https://github.com/kokkos/kokkos/pull/4215)
+- Use KOKKOS_CXX_HOST_COMPILER_ID for identifying CPU arch flags [\#4199](https://github.com/kokkos/kokkos/pull/4199)
+- Added support for Cray Clang to Makefile.kokkos [\#4176](https://github.com/kokkos/kokkos/pull/4176)
+- Add XLClang as compiler [\#4120](https://github.com/kokkos/kokkos/pull/4120)
+- Keep quoted compiler flags when passing to Trilinos [\#3987](https://github.com/kokkos/kokkos/pull/3987)
+- Add support for AMD Zen3 CPU architecture [\#3972](https://github.com/kokkos/kokkos/pull/3972)
+- Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945)
+- Add cppcoreguidelines-pro-type-cstyle-cast to clang-tidy [\#3522](https://github.com/kokkos/kokkos/pull/3522)
+- Add sve bit size definition for A64FX [\#3947](https://github.com/kokkos/kokkos/pull/3947) [\#3946](https://github.com/kokkos/kokkos/pull/3946)
+- Remove KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES [\#4150](https://github.com/kokkos/kokkos/pull/4150)
+
+### Other Changes:
+
+#### Tool Enhancements:
+
+- Retrieve original value from a point in a MultidimensionalSparseTuningProblem [\#3977](https://github.com/kokkos/kokkos/pull/3977)
+- Allow extension of built-in tuners with additional tuning axes [\#3961](https://github.com/kokkos/kokkos/pull/3961)
+- Added a categorical tuner [\#3955](https://github.com/kokkos/kokkos/pull/3955)
+
+
+#### Miscellaneous:
+
+- hpcbind: Use double quotes around $@ when invoking user command [\#4284](https://github.com/kokkos/kokkos/pull/4284)
+- Add file and line to error message [\#3985](https://github.com/kokkos/kokkos/pull/3985)
+- Fix compiler warnings when compiling with nvc++ [\#4198](https://github.com/kokkos/kokkos/pull/4198)
+- Add OpenMPTarget CI build on AMD GPUs [\#4055](https://github.com/kokkos/kokkos/pull/4055)
+- CI: icpx is now part of intel container [\#4002](https://github.com/kokkos/kokkos/pull/4002)
+
+### Incompatibilities:
+
+- Remove pre CUDA 9 KOKKOS_IMPL_CUDA_* macros [\#4138](https://github.com/kokkos/kokkos/pull/4138)
+
+### Bug Fixes:
+- UnorderedMap::clear() should zero the size() [\#4130](https://github.com/kokkos/kokkos/pull/4130)
+- Add memory fence for HostSharedPtr::cleanup() [\#4144](https://github.com/kokkos/kokkos/pull/4144)
+- SYCL: Fix race conditions in TeamPolicy::parallel_reduce [\#4418](https://github.com/kokkos/kokkos/pull/4418)
+- Adding missing memory fence to serial exec space fence. [\#4292](https://github.com/kokkos/kokkos/pull/4292)
+- Fix using external SYCL queues in tests [\#4291](https://github.com/kokkos/kokkos/pull/4291)
+- Fix digits10 bug [\#4281](https://github.com/kokkos/kokkos/pull/4281)
+- Fixes constexpr errors with frounding-math on gcc < 10. [\#4278](https://github.com/kokkos/kokkos/pull/4278)
+- Fix compiler flags for PGI/NVHPC [\#4264](https://github.com/kokkos/kokkos/pull/4264)
+- Fix Zen2/3 also implying Zen Arch with Makefiles [\#4260](https://github.com/kokkos/kokkos/pull/4260)
+- Kokkos_Cuda.hpp: Fix shadow warning with cuda/11.0 [\#4252](https://github.com/kokkos/kokkos/pull/4252)
+- Fix issue w/ static initialization of function attributes [\#4242](https://github.com/kokkos/kokkos/pull/4242)
+- Disable long double hypot test on Power systems [\#4221](https://github.com/kokkos/kokkos/pull/4221)
+- Fix false sharing in random pool [\#4218](https://github.com/kokkos/kokkos/pull/4218)
+- Fix a missing memory_fence for debug shared alloc code [\#4216](https://github.com/kokkos/kokkos/pull/4216)
+- Fix two xl issues [\#4179](https://github.com/kokkos/kokkos/pull/4179)
+- Makefile.kokkos: fix (standard_in) 1: syntax error [\#4173](https://github.com/kokkos/kokkos/pull/4173)
+- Fixes for query_device example [\#4172](https://github.com/kokkos/kokkos/pull/4172)
+- Fix a bug when using HIP atomic with Kokkos::Complex [\#4159](https://github.com/kokkos/kokkos/pull/4159)
+- Fix mistaken logic in pthread creation [\#4157](https://github.com/kokkos/kokkos/pull/4157)
+- Define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION when requesting Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON [\#4107](https://github.com/kokkos/kokkos/pull/4107)
+- Fix compilation with latest MSVC version [\#4102](https://github.com/kokkos/kokkos/pull/4102)
+- Fix incorrect macro definitions when compiling with Intel compiler on Windows [\#4087](https://github.com/kokkos/kokkos/pull/4087)
+- Fixup global buffer overflow in hand rolled string manipulation [\#4070](https://github.com/kokkos/kokkos/pull/4070)
+- Fixup heap buffer overflow in cmd line args parsing unit tests [\#4069](https://github.com/kokkos/kokkos/pull/4069)
+- Only add quotes in compiler flags for Trilinos if necessary [\#4067](https://github.com/kokkos/kokkos/pull/4067)
+- Fixed invocation of tools init callbacks [\#4061](https://github.com/kokkos/kokkos/pull/4061)
+- Work around SYCL JIT compiler issues with static variables [\#4013](https://github.com/kokkos/kokkos/pull/4013)
+- Fix TestDetectionIdiom.cpp test inclusion for Trilinos/TriBITS [\#4010](https://github.com/kokkos/kokkos/pull/4010)
+- Fixup allocation headers with OpenMPTarget backend [\#4003](https://github.com/kokkos/kokkos/pull/4003)
+- Add missing specialization for OMPT to Kokkos Random [\#3967](https://github.com/kokkos/kokkos/pull/3967)
+- Disable hypot long double test on power arches [\#3962](https://github.com/kokkos/kokkos/pull/3962)
+- Use different EBO workaround for MSVC (rebased) [\#3924](https://github.com/kokkos/kokkos/pull/3924)
+- Fix SYCL Kokkos::Profiling::(de)allocateData calls [\#3928](https://github.com/kokkos/kokkos/pull/3928)
+
 ## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01)
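One entry in the feature list above, the detection idiom from the C++ library fundamentals TS v2 ([\#3980](https://github.com/kokkos/kokkos/pull/3980)), is compact enough to sketch in full. A minimal self-contained version of the idiom itself (C++17, illustrative rather than the Kokkos implementation):

```cpp
#include <type_traits>

// Detection idiom: is_detected<Op, Args...> is true iff Op<Args...> is well-formed.
template <class, template <class...> class Op, class... Args>
struct detector : std::false_type {};

template <template <class...> class Op, class... Args>
struct detector<std::void_t<Op<Args...>>, Op, Args...> : std::true_type {};

template <template <class...> class Op, class... Args>
using is_detected = detector<void, Op, Args...>;

// Archetype: does T expose a member type named execution_space?
template <class T>
using execution_space_t = typename T::execution_space;

struct WithSpace    { using execution_space = int; };
struct WithoutSpace {};

static_assert(is_detected<execution_space_t, WithSpace>::value, "");
static_assert(!is_detected<execution_space_t, WithoutSpace>::value, "");

int main() { return 0; }
```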
 
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index 9452027d8ee99592293cba43b7a60a7d2c0c3bbc..1b6753f983db34e64bc1e10bfc3f008c6fec5ede 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -111,8 +111,8 @@ ENDIF()
 
 
 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 4)
-set(Kokkos_VERSION_PATCH 01)
+set(Kokkos_VERSION_MINOR 5)
+set(Kokkos_VERSION_PATCH 00)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
@@ -210,7 +210,12 @@ IF (KOKKOS_HAS_TRILINOS)
   # which needs another workaround.
   SET(KOKKOS_COMPILE_OPTIONS_TMP)
   FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
-    LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\")
+    STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
+    IF(OPTION_HAS_WHITESPACE EQUAL -1)
+      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}")
+    ELSE()
+      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"")
+    ENDIF()
   ENDFOREACH()
   STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}")
   LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS})
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index bda8572073a326320dab54084080bd57115ce791..7ab18f5894e8880bd0584a4815f2260e07772cfb 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -1,19 +1,19 @@
 # Default settings common options.
 
 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 4
-KOKKOS_VERSION_PATCH = 01
+KOKKOS_VERSION_MINOR = 5
+KOKKOS_VERSION_PATCH = 00
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
-# Options: Cuda,HIP,OpenMP,Pthread,Serial
+# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Pthread,Serial
 #KOKKOS_DEVICES ?= "OpenMP"
 KOKKOS_DEVICES ?= "Pthread"
-# Options: 
+# Options:
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKX
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
-# AMD-GPUS: Vega900,Vega906,Vega908
+# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
 KOKKOS_ARCH ?= ""
 # Options: yes,no
@@ -22,7 +22,7 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++14"
-# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align
+# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings
 KOKKOS_OPTIONS ?= ""
 KOKKOS_CMAKE ?= "no"
 KOKKOS_TRIBITS ?= "no"
@@ -70,7 +70,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper
 
 # Check for advanced settings.
 KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
-KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
+KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
 KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
 KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
@@ -82,6 +82,9 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
 KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
 KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
+KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics)
+KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings)
 
 KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)
 
@@ -102,6 +105,7 @@ endif
 # Check for other Execution Spaces.
 KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda)
 KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP)
+KOKKOS_INTERNAL_USE_SYCL := $(call kokkos_has_string,$(KOKKOS_DEVICES),SYCL)
 KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget)
 
 KOKKOS_DEVICELIST =
@@ -123,11 +127,18 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   KOKKOS_DEVICELIST += HIP
 endif
+KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
+                                                  + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
+                                                  + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  KOKKOS_DEVICELIST += SYCL
+  ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
+    $(error SYCL backend requires C++17 or newer)
+  endif
+
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   KOKKOS_DEVICELIST += OPENMPTARGET
-  KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
-                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
-                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
   ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
     $(error OpenMPTarget backend requires C++17 or newer)
   endif
@@ -158,6 +169,8 @@ KOKKOS_INTERNAL_COMPILER_XL          := $(strip $(shell $(CXX) -qversion       2
 KOKKOS_INTERNAL_COMPILER_CRAY        := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-"))
 KOKKOS_INTERNAL_COMPILER_NVCC        := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc))
 KOKKOS_INTERNAL_COMPILER_CLANG       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
+KOKKOS_INTERNAL_COMPILER_CRAY_CLANG  := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++"))
+KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI)
 KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
 KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 KOKKOS_INTERNAL_COMPILER_GCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)
@@ -237,7 +250,11 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
   KOKKOS_INTERNAL_OPENMP_FLAG := -mp
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
+    KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+    else
     KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+    endif
   else
     ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
       KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
@@ -249,7 +266,11 @@ else
           # OpenMP is turned on by default in Cray compiler environment.
           KOKKOS_INTERNAL_OPENMP_FLAG :=
         else
-          KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+          ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
+            KOKKOS_INTERNAL_OPENMP_FLAG := -fiopenmp
+          else
+            KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+          endif
         endif
       endif
     endif
@@ -307,6 +328,13 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)
 
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP)
+
 # NVIDIA based.
 NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper
 KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30)
@@ -374,20 +402,25 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
 KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3)
 KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
-KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0)
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 0)
+    KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+  endif
+endif
 KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
 KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)
 KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908)
+KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A)
 
 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
-KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
 KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
@@ -396,8 +429,8 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW
 KOKKOS_INTERNAL_USE_TM            := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
-KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc))
+KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
   $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
@@ -432,6 +465,10 @@ KOKKOS_LINK_FLAGS =
 KOKKOS_SRC =
 KOKKOS_HEADERS =
 
+#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+  KOKKOS_LIBS += -latomic
+#endif
+
 # Generating the KokkosCore_config.h file.
 
 KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@@ -468,6 +505,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP')
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_SYCL')
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
   ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
@@ -523,6 +564,12 @@ endif
 
 #only add the c++ standard flags if this is not CMake
 tmp := $(call kokkos_append_header,"/* General Settings */")
+ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3")
+endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATION_WARNINGS")
+endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
 ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
   KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
@@ -625,8 +672,10 @@ endif
 
 tmp := $(call kokkos_append_header,"/* Optimization Settings */")
 
-ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
+ifeq ($(KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION), 1)
+  # deprecated
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION")
 endif
 
 tmp := $(call kokkos_append_header,"/* Cuda Settings */")
@@ -1156,6 +1205,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
     KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908
   endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 90A")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a
+  endif
 
 
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
@@ -1174,6 +1228,52 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   endif
 endif
 
+# Figure out the architecture flag for SYCL.
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  # Let's start by adding architecture defines
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
+  endif
+
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp)
+
+  KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+  KOKKOS_LDFLAGS+=-fsycl
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
+endif
 
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
 
@@ -1185,57 +1285,63 @@ endif
 
 ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
   tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
-endif
 
-# Functions for generating config header file
-kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
-kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
-kokkos_append_config_header = $(shell echo $1 >> $2))
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
-tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
-   ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-   else
-   endif
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  # Functions for generating config header file
+  kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
+  kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
+  kokkos_append_config_header = $(shell echo $1 >> $2)
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
+  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
+    ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
+    else
+    endif
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SYCL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SYCL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_SYCL.hpp>","KokkosCore_Config_SetupBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
 endif
+
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
@@ -1247,6 +1353,9 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+  ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+    KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+  endif
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
   ifneq ($(CUDA_PATH),)
    KOKKOS_CPPFLAGS += -I$(CUDA_PATH)/include
diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets
index cf9fc242420e1dbbb519b3312cf1a4c3b4354738..93854d0cf150c97d5058422b7ca9ff28ce2ba8b6 100644
--- a/packages/kokkos/Makefile.targets
+++ b/packages/kokkos/Makefile.targets
@@ -48,6 +48,17 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+Kokkos_SYCL.o : $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+Kokkos_SYCL_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+Kokkos_SYCL_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md
index d55ef2caac93ae6803aa97925ea7b081d3f05ca3..673f4627125223f0d09f1e184d9942c0e6a0b7ff 100644
--- a/packages/kokkos/README.md
+++ b/packages/kokkos/README.md
@@ -7,7 +7,7 @@ applications targeting all major HPC platforms. For that purpose it provides
 abstractions for both parallel execution of code and data management.
 Kokkos is designed to target complex node architectures with N-level memory
 hierarchies and multiple types of execution resources. It currently can use
-CUDA, HPX, OpenMP and Pthreads as backend programming models with several other
+CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other
 backends in development.
 
 Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,
@@ -16,29 +16,19 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools).
 
 # Learning about Kokkos
 
-A programming guide can be found on the Wiki, the API reference is under development.
+The best way to start learning about Kokkos is to go through the Kokkos Lectures.
+They are available online at https://kokkos.link/the-lectures and contain a mix
+of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem
+capabilities.
+
+A programming guide and API reference can be found on the Wiki
+(https://github.com/kokkos/kokkos/wiki).
 
 For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
 
 For non-public questions send an email to
 crtrott(at)sandia.gov
 
-A separate repository with extensive tutorial material can be found under
-https://github.com/kokkos/kokkos-tutorials.
-
-Furthermore, the 'example/tutorial' directory provides step by step tutorial
-examples which explain many of the features of Kokkos. They work with
-simple Makefiles. To build with g++ and OpenMP simply type 'make'
-in the 'example/tutorial' directory. This will build all examples in the
-subfolders. To change the build options refer to the Programming Guide
-in the compilation section.
-
-To learn more about Kokkos consider watching one of our presentations:
-* GTC 2015:
-  - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html
-  - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf
-
-
 # Contributing to Kokkos
 
 We are open and try to encourage contributions from external developers.
@@ -53,57 +43,40 @@ For specifics see the LICENSE file contained in the repository or distribution.
 
 # Requirements
 
-### Primary tested compilers on X86 are:
-* GCC 5.3.0
-* GCC 5.4.0
-* GCC 5.5.0
-* GCC 6.1.0
-* GCC 7.2.0
-* GCC 7.3.0
-* GCC 8.1.0
-* Intel 17.0.1
-* Intel 17.4.196
-* Intel 18.2.128
-* Clang 4.0.0
-* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0)
-* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1)
-* Clang 8.0.0 for CUDA (CUDA Toolkit 9.2)
-* PGI 18.7
-* NVCC 9.1 for CUDA (with gcc 6.1.0)
-* NVCC 9.2 for CUDA (with gcc 7.2.0)
-* NVCC 10.0 for CUDA (with gcc 7.4.0)
-* NVCC 10.1 for CUDA (with gcc 7.4.0)
-* NVCC 11.0 for CUDA (with gcc 8.4.0)
-
-### Primary tested compilers on Power 8 are:
-* GCC 6.4.0 (OpenMP,Serial)
-* GCC 7.2.0 (OpenMP,Serial)
-* IBM XL 16.1.0 (OpenMP, Serial)
-* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0)
-
-### Primary tested compilers on Intel KNL are:
-* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0)
-* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0)
-
-### Primary tested compilers on ARM (Cavium ThunderX2)
-* GCC 7.2.0
-* ARM/Clang 18.4.0
-
-### Other compilers working:
-* X86:
-    * Cygwin 2.1.0 64bit with gcc 4.9.3
-    * GCC 8.1.0 (not warning free)
-
-### Known non-working combinations:
-* Power8:
-    * Pthreads backend
-* ARM
-    * Pthreads backend
+### Minimum Compiler Versions
+
+Generally, Kokkos should work with all compiler versions newer than the minimum.
+However, as in any sufficiently complex code base, we have to work around compiler
+bugs with almost all compilers, so compiler versions we don't test may have issues
+we are unaware of.
+
+* GCC: 5.3.0
+* Clang: 4.0.0
+* Intel: 17.0.1
+* NVCC: 9.2.88
+* NVC++: 21.5
+* ROCM: 4.3
+* MSVC: 19.29
+* IBM XL: 16.1.1
+* Fujitsu: 4.5.0
+* ARM/Clang: 20.1
+
+### Primary Tested Compilers
+
+* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0
+* NVCC: 9.2.88, 10.1, 11.0
+* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0
+* Intel: 17.4, 18.1, 19.5
+* MSVC: 19.29
+* ARM/Clang: 20.1
+* IBM XL: 16.1.1
+* ROCM: 4.3.0
 
 ### Build system:
-* CMake >= 3.10: required
-* CMake >= 3.13: recommended
+
+* CMake >= 3.16: required
 * CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues).
+* CMake >= 3.21.1 for NVC++
 
 Primary tested compilers are passing in release mode
 with warnings as errors. They are also tested with a comprehensive set of
@@ -153,7 +126,6 @@ cmake $srcdir \
   -DCMAKE_INSTALL_PREFIX=$path_to_install \
   -DKokkos_ENABLE_OPENMP=On \
   -DKokkos_ARCH_HSW=On \
-  -DKokkos_ENABLE_HWLOC=On \
   -DKokkos_HWLOC_DIR=$path_to_hwloc
 ````
 then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages.
@@ -212,23 +184,8 @@ where `...` is the unique spec identifying the particular Kokkos configuration a
 Some more details can found in the Kokkos spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest).
 
 ## Raw Makefile
-A bash script is provided to generate raw makefiles.
-To install Kokkos as a library create a build directory and run the following
-````bash
-> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
-````
-Once the Makefile is generated, run:
-````bash
-> make kokkoslib
-> make install
-````
-To additionally run the unit tests:
-````bash
-> make build-test
-> make test
-````
-Run `generate_makefile.bash --help` for more detailed options such as
-changing the device type for which to build.
+
+Raw Makefiles are only supported via inline builds. See below.
 
 ## Inline Builds vs. Installed Package
 For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package.
@@ -268,6 +225,35 @@ more than a single GPU is used by a single process.
 
 If you publish work which mentions Kokkos, please cite the following paper:
 
+````BibTeX
+@ARTICLE{9485033,
+  author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Kokkos 3: Programming Model Extensions for the Exascale Era},
+  year={2022},
+  volume={33},
+  number={4},
+  pages={805-817},
+  doi={10.1109/TPDS.2021.3097283}}
+````
+
+If you use more than one Kokkos EcoSystem package, please also cite:
+
+````BibTeX
+@ARTICLE{9502936,
+  author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff},
+  journal={Computing in Science \& Engineering},
+  title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing},
+  year={2021},
+  volume={23},
+  number={5},
+  pages={10-18},
+  doi={10.1109/MCSE.2021.3098509}}
+````
+
+
+If you feel generous, also consider citing the original Kokkos paper, which describes most of the basic Kokkos concepts:
+
 ````BibTeX
 @article{CarterEdwards20143202,
   title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt
index 4df76a1dbbd17eb269694b8bd801184ccc02e047..eb54db8a556453f4afed4c95cf20321e8cbe211e 100644
--- a/packages/kokkos/algorithms/CMakeLists.txt
+++ b/packages/kokkos/algorithms/CMakeLists.txt
@@ -5,9 +5,7 @@ KOKKOS_SUBPACKAGE(Algorithms)
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
-IF(NOT (KOKKOS_ENABLE_OPENMPTARGET
-        AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR
-             KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)))
+IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
   KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
 ENDIF()
 
diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
index 55ce19971faf576483da1ec66cedea0735bc8c7a..46b8ab87fabfbeabda12beb3ddabf0eb6aab3482 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -447,6 +447,25 @@ struct rand<Generator, unsigned long long> {
   }
 };
 
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <class Generator>
+struct rand<Generator, Kokkos::Experimental::half_t> {
+  using half = Kokkos::Experimental::half_t;
+  KOKKOS_INLINE_FUNCTION
+  static half max() { return half(1.0); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen) { return half(gen.frand()); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& range) {
+    return half(gen.frand(float(range)));
+  }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& start, const half& end) {
+    return half(gen.frand(float(start), float(end)));
+  }
+};
+#endif  // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+
 template <class Generator>
 struct rand<Generator, float> {
   KOKKOS_INLINE_FUNCTION
@@ -600,7 +619,7 @@ struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
 
 template <class ExecutionSpace>
 struct Random_UniqueIndex {
-  using locks_view_type = View<int*, ExecutionSpace>;
+  using locks_view_type = View<int**, ExecutionSpace>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
@@ -615,7 +634,7 @@ struct Random_UniqueIndex {
 #ifdef KOKKOS_ENABLE_CUDA
 template <>
 struct Random_UniqueIndex<Kokkos::Cuda> {
-  using locks_view_type = View<int*, Kokkos::Cuda>;
+  using locks_view_type = View<int**, Kokkos::Cuda>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
 #ifdef __CUDA_ARCH__
@@ -625,7 +644,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
                  blockDim.x * blockDim.y * blockDim.z +
              i_offset) %
             locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i += blockDim.x * blockDim.y * blockDim.z;
       if (i >= static_cast<int>(locks_.extent(0))) {
         i = i_offset;
@@ -643,7 +662,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
 #ifdef KOKKOS_ENABLE_HIP
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
-  using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
+  using locks_view_type = View<int**, Kokkos::Experimental::HIP>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
 #ifdef __HIP_DEVICE_COMPILE__
@@ -653,7 +672,7 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
                  blockDim.x * blockDim.y * blockDim.z +
              i_offset) %
             locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i += blockDim.x * blockDim.y * blockDim.z;
       if (i >= static_cast<int>(locks_.extent(0))) {
         i = i_offset;
@@ -671,15 +690,15 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
 #ifdef KOKKOS_ENABLE_SYCL
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
-  using locks_view_type = View<int*, Kokkos::Experimental::SYCL>;
+  using locks_view_type = View<int**, Kokkos::Experimental::SYCL>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
-#ifdef KOKKOS_ARCH_INTEL_GEN
+#ifdef KOKKOS_ARCH_INTEL_GPU
     int i = Kokkos::Impl::clock_tic() % locks_.extent(0);
 #else
     int i = 0;
 #endif
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i = (i + 1) % static_cast<int>(locks_.extent(0));
     }
     return i;
@@ -690,14 +709,14 @@ struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::OpenMPTarget> {
-  using locks_view_type = View<int*, Kokkos::Experimental::OpenMPTarget>;
+  using locks_view_type = View<int**, Kokkos::Experimental::OpenMPTarget>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks) {
     const int team_size = omp_get_num_threads();
     int i               = omp_get_team_num() * team_size + omp_get_thread_num();
     const int lock_size = locks.extent_int(0);
 
-    while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
       i = (i + 1) % lock_size;
     }
     return i;
@@ -856,18 +875,22 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift64_Pool {
  private:
   using execution_space = typename DeviceType::execution_space;
-  using locks_type      = View<int*, execution_space>;
-  using state_data_type = View<uint64_t*, DeviceType>;
+  using locks_type      = View<int**, execution_space>;
+  using state_data_type = View<uint64_t**, DeviceType>;
   locks_type locks_;
   state_data_type state_;
   int num_states_;
+  int padding_;
 
  public:
   using generator_type = Random_XorShift64<DeviceType>;
   using device_type    = DeviceType;
 
   KOKKOS_INLINE_FUNCTION
-  Random_XorShift64_Pool() { num_states_ = 0; }
+  Random_XorShift64_Pool() {
+    num_states_ = 0;
+    padding_    = 0;
+  }
   Random_XorShift64_Pool(uint64_t seed) {
     num_states_ = 0;
 
@@ -883,16 +906,22 @@ class Random_XorShift64_Pool {
     locks_      = src.locks_;
     state_      = src.state_;
     num_states_ = src.num_states_;
+    padding_    = src.padding_;
     return *this;
   }
 
   void init(uint64_t seed, int num_states) {
     if (seed == 0) seed = uint64_t(1318319);
-
+    // Only pad on CPU-like archs (fewer than 1000 threads), where padding
+    // keeps concurrently used lock/state entries on separate cache lines.
+    // 64 is a magic number: something neither too large nor too small.
+    padding_    = num_states < 1000 ? 64 : 1;
     num_states_ = num_states;
 
-    locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
-    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);
+    locks_ =
+        locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_);
+    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_,
+                             padding_);
 
     typename state_data_type::HostMirror h_state = create_mirror_view(state_);
     typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);
@@ -902,15 +931,15 @@ class Random_XorShift64_Pool {
         gen(seed, 0);
     for (int i = 0; i < 17; i++) gen.rand();
     for (int i = 0; i < num_states_; i++) {
-      int n1     = gen.rand();
-      int n2     = gen.rand();
-      int n3     = gen.rand();
-      int n4     = gen.rand();
-      h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
-                   (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
-                   (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
-                   (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
-      h_lock(i) = 0;
+      int n1        = gen.rand();
+      int n2        = gen.rand();
+      int n3        = gen.rand();
+      int n4        = gen.rand();
+      h_state(i, 0) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
+                      (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
+                      (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
+                      (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
+      h_lock(i, 0) = 0;
     }
     deep_copy(state_, h_state);
     deep_copy(locks_, h_lock);
@@ -920,19 +949,19 @@ class Random_XorShift64_Pool {
   Random_XorShift64<DeviceType> get_state() const {
     const int i =
         Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
-    return Random_XorShift64<DeviceType>(state_(i), i);
+    return Random_XorShift64<DeviceType>(state_(i, 0), i);
   }
 
   // NOTE: state_idx MUST be unique and less than num_states
   KOKKOS_INLINE_FUNCTION
   Random_XorShift64<DeviceType> get_state(const int state_idx) const {
-    return Random_XorShift64<DeviceType>(state_(state_idx), state_idx);
+    return Random_XorShift64<DeviceType>(state_(state_idx, 0), state_idx);
   }
 
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift64<DeviceType>& state) const {
-    state_(state.state_idx_) = state.state_;
-    locks_(state.state_idx_) = 0;
+    state_(state.state_idx_, 0) = state.state_;
+    locks_(state.state_idx_, 0) = 0;
   }
 };
 
@@ -1092,14 +1121,15 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift1024_Pool {
  private:
   using execution_space = typename DeviceType::execution_space;
-  using locks_type      = View<int*, execution_space>;
-  using int_view_type   = View<int*, DeviceType>;
+  using locks_type      = View<int**, execution_space>;
+  using int_view_type   = View<int**, DeviceType>;
   using state_data_type = View<uint64_t * [16], DeviceType>;
 
   locks_type locks_;
   state_data_type state_;
   int_view_type p_;
   int num_states_;
+  int padding_;
   friend class Random_XorShift1024<DeviceType>;
 
  public:
@@ -1129,15 +1159,21 @@ class Random_XorShift1024_Pool {
     state_      = src.state_;
     p_          = src.p_;
     num_states_ = src.num_states_;
+    padding_    = src.padding_;
     return *this;
   }
 
   inline void init(uint64_t seed, int num_states) {
     if (seed == 0) seed = uint64_t(1318319);
+    // Only pad on CPU-like archs (fewer than 1000 threads), where padding
+    // keeps concurrently used lock/state entries on separate cache lines.
+    // 64 is a magic number: something neither too large nor too small.
+    padding_    = num_states < 1000 ? 64 : 1;
     num_states_ = num_states;
-    locks_      = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
+    locks_ =
+        locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_);
     state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
-    p_     = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);
+    p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_);
 
     typename state_data_type::HostMirror h_state = create_mirror_view(state_);
     typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);
@@ -1158,8 +1194,8 @@ class Random_XorShift1024_Pool {
                         (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
                         (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
       }
-      h_p(i)    = 0;
-      h_lock(i) = 0;
+      h_p(i, 0)    = 0;
+      h_lock(i, 0) = 0;
     }
     deep_copy(state_, h_state);
     deep_copy(locks_, h_lock);
@@ -1169,20 +1205,20 @@ class Random_XorShift1024_Pool {
   Random_XorShift1024<DeviceType> get_state() const {
     const int i =
         Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
-    return Random_XorShift1024<DeviceType>(state_, p_(i), i);
+    return Random_XorShift1024<DeviceType>(state_, p_(i, 0), i);
   };
 
   // NOTE: state_idx MUST be unique and less than num_states
   KOKKOS_INLINE_FUNCTION
   Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
-    return Random_XorShift1024<DeviceType>(state_, p_(state_idx), state_idx);
+    return Random_XorShift1024<DeviceType>(state_, p_(state_idx, 0), state_idx);
   }
 
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift1024<DeviceType>& state) const {
     for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
-    p_(state.state_idx_)     = state.p_;
-    locks_(state.state_idx_) = 0;
+    p_(state.state_idx_, 0)     = state.p_;
+    locks_(state.state_idx_, 0) = 0;
   }
 };
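The pool protocol itself is unchanged by the padding: a caller acquires a per-thread generator with `get_state()` (which spins on an atomic compare-exchange over the lock view), draws values, and returns the generator with `free_state()`. A minimal usage sketch, assuming the public pool API; the seed and sizes are arbitrary illustrations, not code from this patch:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // The pool pads its rank-2 lock/state views internally on CPU-like
    // backends to reduce false sharing between neighboring slots.
    Kokkos::Random_XorShift64_Pool<> pool(/*seed=*/1318319);
    Kokkos::View<double*> v("v", 1 << 20);
    Kokkos::parallel_for(
        "fill_random", v.extent(0), KOKKOS_LAMBDA(const int i) {
          auto gen = pool.get_state();   // lock one generator state
          v(i)     = gen.drand(0., 1.);  // uniform double in [0, 1)
          pool.free_state(gen);          // release it for other threads
        });
  }
  Kokkos::finalize();
}
````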
 
diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
index d17c02776ff5653045d9259d7a4fae2207546e21..7c1ce4c4cd8e757f3018da3989660d3b4c5e4cff 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -319,7 +319,7 @@ class BinSort {
                    Kokkos::RangePolicy<execution_space>(0, len), functor);
     }
 
-    execution_space().fence();
+    execution_space().fence("Kokkos::Sort: fence after sorting");
   }
 
   template <class ValuesViewType>
@@ -492,7 +492,8 @@ bool try_std_sort(ViewType view) {
                       view.stride_3(), view.stride_4(), view.stride_5(),
                       view.stride_6(), view.stride_7()};
   possible         = possible &&
-             std::is_same<typename ViewType::memory_space, HostSpace>::value;
+             SpaceAccessibility<HostSpace,
+                                typename ViewType::memory_space>::accessible;
   possible = possible && (ViewType::Rank == 1);
   possible = possible && (stride[0] == 1);
   if (possible) {
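The `try_std_sort` change above swaps an exact-type test for an accessibility test: `std::sort` on the host is valid for any memory space the host can reach, not only `HostSpace` itself, with CUDA UVM as the classic example. A hedged compile-time illustration of the trait; which assertions apply depends on the enabled backends:

````cpp
#include <Kokkos_Core.hpp>

// Can code executing in the first space access memory in the second?
static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                         Kokkos::HostSpace>::accessible,
              "the host can always access HostSpace");

#ifdef KOKKOS_ENABLE_CUDA
// Not the same type as HostSpace, yet host-accessible -- exactly the
// case the old std::is_same check rejected.
static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                         Kokkos::CudaUVMSpace>::accessible,
              "UVM memory is reachable from the host");
#endif
````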
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
index c37e779c9927b21b2add67236124f3821341d968..3dffce7df4f8dd1663a10a871075c1841005138d 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -47,7 +47,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <cmath>
@@ -198,11 +198,50 @@ struct test_random_functor {
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp2 / theMax);
       const uint64_t ind3_3d =
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp3 / theMax);
-
+// Work around an Intel 17 compiler bug which sometimes adds random
+// instruction alignment that makes the lock instruction
+// illegal. It seems to affect mostly unsigned int atomics.
+// Looking at the assembly, the compiler
+// appears to insert cache-line alignment for the instruction.
+// The issue is not restricted to specific archs; it was seen on SNB and
+// SKX, but for different code. Another occurrence was with Desul atomics
+// in a different unit test; this one happens without Desul atomics.
+// Inserting an assembly nop instruction changes the alignment and
+// works around this.
+//
+// 17.0.4 for 64bit Random works with 1/1/1/2/1
+// 17.0.4 for 1024bit Random works with 1/1/1/1/1
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind1_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind2_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind3_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      if (std::is_same<rnd_type, Kokkos::Random_XorShift64<device_type>>::value)
+        asm volatile("nop\n");
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
     }
     rand_pool.free_state(rand_gen);
   }
@@ -338,9 +377,11 @@ struct test_random_scalar {
       using functor_type =
           test_histogram1d_functor<typename RandomGenerator::device_type>;
       parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result);
-
-      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
-      double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D;
+      double mean_eps_expect       = 0.0001;
+      double variance_eps_expect   = 0.07;
+      double covariance_eps_expect = 0.06;
+      double tolerance             = 6 * std::sqrt(1.0 / HIST_DIM1D);
+      double mean_expect           = 1.0 * num_draws * 3 / HIST_DIM1D;
       double variance_expect =
           1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
       double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D;
@@ -349,11 +390,26 @@ struct test_random_scalar {
           variance_expect / (result.variance / HIST_DIM1D) - 1.0;
       double covariance_eps =
           (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
-      pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0;
-      pass_hist1d_var =
-          ((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 1 : 0;
-      pass_hist1d_covar =
-          ((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 1 : 0;
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+      if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
+        mean_eps_expect       = 0.0003;
+        variance_eps_expect   = 1.0;
+        covariance_eps_expect = 5.0e4;
+      }
+#endif
+
+      pass_hist1d_mean =
+          ((-mean_eps_expect < mean_eps) && (mean_eps_expect > mean_eps)) ? 1
+                                                                          : 0;
+      pass_hist1d_var = ((-variance_eps_expect < variance_eps) &&
+                         (variance_eps_expect > variance_eps))
+                            ? 1
+                            : 0;
+      pass_hist1d_covar = ((-covariance_eps_expect < covariance_eps) &&
+                           (covariance_eps_expect > covariance_eps))
+                              ? 1
+                              : 0;
 
       cout << "Density 1D: " << mean_eps << " " << variance_eps << " "
            << (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || "
@@ -371,8 +427,9 @@ struct test_random_scalar {
           test_histogram3d_functor<typename RandomGenerator::device_type>;
       parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result);
 
-      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
-      double mean_expect = 1.0 * num_draws / HIST_DIM1D;
+      double variance_factor = 1.2;
+      double tolerance       = 6 * std::sqrt(1.0 / HIST_DIM1D);
+      double mean_expect     = 1.0 * num_draws / HIST_DIM1D;
       double variance_expect =
           1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
       double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D;
@@ -381,15 +438,23 @@ struct test_random_scalar {
           variance_expect / (result.variance / HIST_DIM1D) - 1.0;
       double covariance_eps =
           (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+      if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
+        variance_factor = 7;
+      }
+#endif
+
       pass_hist3d_mean =
           ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0;
-      pass_hist3d_var = ((-1.2 * tolerance < variance_eps) &&
-                         (1.2 * tolerance > variance_eps))
+      pass_hist3d_var = ((-variance_factor * tolerance < variance_eps) &&
+                         (variance_factor * tolerance > variance_eps))
                             ? 1
                             : 0;
-      pass_hist3d_covar =
-          ((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 1
-                                                                          : 0;
+      pass_hist3d_covar = ((-variance_factor * tolerance < covariance_eps) &&
+                           (variance_factor * tolerance > covariance_eps))
+                              ? 1
+                              : 0;
 
       cout << "Density 3D: " << mean_eps << " " << variance_eps << " "
            << result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance
@@ -471,6 +536,21 @@ void test_random(unsigned int num_draws) {
   deep_copy(density_1d, 0);
   deep_copy(density_3d, 0);
 
+  cout << "Test Scalar=half" << endl;
+  test_random_scalar<RandomGenerator, Kokkos::Experimental::half_t> test_half(
+      density_1d, density_3d, pool, num_draws);
+  ASSERT_EQ(test_half.pass_mean, 1);
+  ASSERT_EQ(test_half.pass_var, 1);
+  ASSERT_EQ(test_half.pass_covar, 1);
+  ASSERT_EQ(test_half.pass_hist1d_mean, 1);
+  ASSERT_EQ(test_half.pass_hist1d_var, 1);
+  ASSERT_EQ(test_half.pass_hist1d_covar, 1);
+  ASSERT_EQ(test_half.pass_hist3d_mean, 1);
+  ASSERT_EQ(test_half.pass_hist3d_var, 1);
+  ASSERT_EQ(test_half.pass_hist3d_covar, 1);
+  deep_copy(density_1d, 0);
+  deep_copy(density_3d, 0);
+
   cout << "Test Scalar=float" << endl;
   test_random_scalar<RandomGenerator, float> test_float(density_1d, density_3d,
                                                         pool, num_draws);
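The widened half-precision tolerances come from quantization: `rand<Generator, half_t>` draws in single precision and narrows to `half_t`, so the histogram bins see coarsened values. A minimal sketch of drawing `half_t` values through the new specialization; the guard and alias mirror the header, while the helper name is hypothetical:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
using half = Kokkos::Experimental::half_t;

// Hypothetical helper: draw a half in [lo, hi) via the float path.
template <class Generator>
KOKKOS_INLINE_FUNCTION half draw_half(Generator& gen, half lo, half hi) {
  return Kokkos::rand<Generator, half>::draw(gen, lo, hi);
}
#endif
````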
diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/appveyor.yml
index e8763c0b665c4a992f74b70eab0caa915beb33dd..73a0d3187596be4d4c99ef9f211b93bd0659079e 100644
--- a/packages/kokkos/appveyor.yml
+++ b/packages/kokkos/appveyor.yml
@@ -3,4 +3,8 @@ image:
 clone_folder: c:\projects\source
 build_script:
 - cmd: >-
-    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake
+    mkdir build &&
+    cd build &&
+    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF &&
+    cmake --build . --target install &&
+    ctest -C Debug --output-on-failure
diff --git a/packages/kokkos/benchmarks/atomic/main.cpp b/packages/kokkos/benchmarks/atomic/main.cpp
index 7b5caa1aee1658104a8916bec314759e3e5ba30a..cc0d3e41e85aaa7483d11dbf639cc5a9d5809a47 100644
--- a/packages/kokkos/benchmarks/atomic/main.cpp
+++ b/packages/kokkos/benchmarks/atomic/main.cpp
@@ -1,12 +1,12 @@
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_Random.hpp>
 
 template <class Scalar>
 double test_atomic(int L, int N, int M, int K, int R,
                    Kokkos::View<const int*> offsets) {
   Kokkos::View<Scalar*> output("Output", N);
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   for (int r = 0; r < R; r++)
     Kokkos::parallel_for(
@@ -28,7 +28,7 @@ template <class Scalar>
 double test_no_atomic(int L, int N, int M, int K, int R,
                       Kokkos::View<const int*> offsets) {
   Kokkos::View<Scalar*> output("Output", N);
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   for (int r = 0; r < R; r++)
     Kokkos::parallel_for(
         L, KOKKOS_LAMBDA(const int& i) {
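`Kokkos::Timer` from `<Kokkos_Timer.hpp>` is the public replacement for the internal `Kokkos::Impl::Timer`; it starts at construction and exposes `seconds()` and `reset()`. A minimal sketch of timing a kernel with it (remember to fence so the measurement covers execution, not just the asynchronous launch):

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>

double time_fill(int n) {
  Kokkos::View<double*> x("x", n);
  Kokkos::Timer timer;  // clock starts here
  Kokkos::parallel_for(
      "fill", n, KOKKOS_LAMBDA(const int i) { x(i) = 2.0 * i; });
  Kokkos::fence();         // wait for the kernel to finish
  return timer.seconds();  // elapsed wall time; reset() would restart
}
````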
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
index 62d7ef4a4cf387191c6d0276c4ea360c289d4de5..4fc6ca2c68b3a77e37360b90b678ae5c461204f6 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 template <class Scalar, int Unroll, int Stride>
 struct Run {
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
index 6da2407a08b7afb981ffb8c3a970b2df7d55f951..75f30a340938378fe85f72b4dd294235594cf21d 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <bench.hpp>
 #include <cstdlib>
 
diff --git a/packages/kokkos/benchmarks/gather/main.cpp b/packages/kokkos/benchmarks/gather/main.cpp
index 5f10e4dcc1aa509c191d3c7a6486114b3c0b7de9..dd502faaa480c1c7ab9936e4f032095094e714bb 100644
--- a/packages/kokkos/benchmarks/gather/main.cpp
+++ b/packages/kokkos/benchmarks/gather/main.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <gather.hpp>
 #include <cstdlib>
 
diff --git a/packages/kokkos/benchmarks/stream/stream-kokkos.cpp b/packages/kokkos/benchmarks/stream/stream-kokkos.cpp
index e7ef67e0805c9c4424fbf1781423cf907dca3eec..311947c197cba64e8354dc63743baa99b0bfa782 100644
--- a/packages/kokkos/benchmarks/stream/stream-kokkos.cpp
+++ b/packages/kokkos/benchmarks/stream/stream-kokkos.cpp
@@ -52,35 +52,33 @@
 
 #define HLINE "-------------------------------------------------------------\n"
 
-#if defined(KOKKOS_ENABLE_CUDA)
-using StreamHostArray   = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror;
-using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>;
-#else
-using StreamHostArray   = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror;
-using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>;
-#endif
+using StreamDeviceArray =
+    Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Restrict>>;
+using StreamHostArray = typename StreamDeviceArray::HostMirror;
 
 using StreamIndex = int;
+using Policy      = Kokkos::RangePolicy<Kokkos::IndexType<StreamIndex>>;
 
-double now() {
-  struct timeval now;
-  gettimeofday(&now, nullptr);
+void perform_set(StreamDeviceArray& a, const double scalar) {
+  Kokkos::parallel_for(
+      "set", Policy(0, a.extent(0)),
+      KOKKOS_LAMBDA(const StreamIndex i) { a[i] = scalar; });
 
-  return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
+  Kokkos::fence();
 }
 
-void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b,
-                  StreamDeviceArray& c) {
+void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b) {
   Kokkos::parallel_for(
-      "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; });
+      "copy", Policy(0, a.extent(0)),
+      KOKKOS_LAMBDA(const StreamIndex i) { b[i] = a[i]; });
 
   Kokkos::fence();
 }
 
-void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
-                   StreamDeviceArray& c, const double scalar) {
+void perform_scale(StreamDeviceArray& b, StreamDeviceArray& c,
+                   const double scalar) {
   Kokkos::parallel_for(
-      "copy", a.extent(0),
+      "scale", Policy(0, b.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; });
 
   Kokkos::fence();
@@ -89,7 +87,7 @@ void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
 void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
                  StreamDeviceArray& c) {
   Kokkos::parallel_for(
-      "add", a.extent(0),
+      "add", Policy(0, a.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; });
 
   Kokkos::fence();
@@ -98,7 +96,7 @@ void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
 void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b,
                    StreamDeviceArray& c, const double scalar) {
   Kokkos::parallel_for(
-      "triad", a.extent(0),
+      "triad", Policy(0, a.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; });
 
   Kokkos::fence();
@@ -184,6 +182,7 @@ int run_benchmark() {
 
   const double scalar = 3.0;
 
+  double setTime   = std::numeric_limits<double>::max();
   double copyTime  = std::numeric_limits<double>::max();
   double scaleTime = std::numeric_limits<double>::max();
   double addTime   = std::numeric_limits<double>::max();
@@ -191,13 +190,10 @@ int run_benchmark() {
 
   printf("Initializing Views...\n");
 
-#if defined(KOKKOS_HAVE_OPENMP)
-  Kokkos::parallel_for(
-      "init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
-#else
   Kokkos::parallel_for(
-      "init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
-#endif
+      "init",
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,
+                                                             STREAM_ARRAY_SIZE),
       KOKKOS_LAMBDA(const int i) {
         a[i] = 1.0;
         b[i] = 2.0;
@@ -209,26 +205,30 @@ int run_benchmark() {
   Kokkos::deep_copy(dev_b, b);
   Kokkos::deep_copy(dev_c, c);
 
-  double start;
-
   printf("Starting benchmarking...\n");
 
+  Kokkos::Timer timer;
+
   for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) {
-    start = now();
-    perform_copy(dev_a, dev_b, dev_c);
-    copyTime = std::min(copyTime, (now() - start));
+    timer.reset();
+    perform_set(dev_c, 1.5);
+    setTime = std::min(setTime, timer.seconds());
+
+    timer.reset();
+    perform_copy(dev_a, dev_c);
+    copyTime = std::min(copyTime, timer.seconds());
 
-    start = now();
-    perform_scale(dev_a, dev_b, dev_c, scalar);
-    scaleTime = std::min(scaleTime, (now() - start));
+    timer.reset();
+    perform_scale(dev_b, dev_c, scalar);
+    scaleTime = std::min(scaleTime, timer.seconds());
 
-    start = now();
+    timer.reset();
     perform_add(dev_a, dev_b, dev_c);
-    addTime = std::min(addTime, (now() - start));
+    addTime = std::min(addTime, timer.seconds());
 
-    start = now();
+    timer.reset();
     perform_triad(dev_a, dev_b, dev_c, scalar);
-    triadTime = std::min(triadTime, (now() - start));
+    triadTime = std::min(triadTime, timer.seconds());
   }
 
   Kokkos::deep_copy(a, dev_a);
@@ -240,6 +240,9 @@ int run_benchmark() {
 
   printf(HLINE);
 
+  printf("Set             %11.2f MB/s\n",
+         (1.0e-06 * 1.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
+             setTime);
   printf("Copy            %11.2f MB/s\n",
          (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
              copyTime);
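The reported rates follow the STREAM convention: bytes moved per iteration divided by the best (minimum) time, where the byte count scales with how many arrays the kernel touches: set writes one, copy and scale read one and write one, add and triad touch three. A small sketch of that arithmetic; the function name is illustrative:

````cpp
#include <cstddef>

// STREAM-style rate in MB/s: 'arrays' counts vectors touched per
// iteration (set = 1, copy/scale = 2, add/triad = 3).
double stream_mb_per_s(double arrays, std::size_t n, double seconds) {
  return 1.0e-6 * arrays * sizeof(double) * static_cast<double>(n) / seconds;
}
````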
diff --git a/packages/kokkos/bin/hpcbind b/packages/kokkos/bin/hpcbind
index 6af091a7d8b60766cddae67c6076b5df1f8ad12f..43f8a745da27c080ce54ac4cfd9b9358f618554f 100755
--- a/packages/kokkos/bin/hpcbind
+++ b/packages/kokkos/bin/hpcbind
@@ -634,15 +634,15 @@ elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
   > ${HPCBIND_OUT}
   if [[ ${HPCBIND_TEE} -eq 0 ]]; then
     if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     else
-      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+      eval "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     fi
   else
     if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
     else
-      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+      eval "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
     fi
   fi
 fi
diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper
index 4e52e4d09f4f86668ccd322b8ab2fe1093f31996..ba2c55508aca0c139853ce7107c53f8a405ec9c3 100755
--- a/packages/kokkos/bin/nvcc_wrapper
+++ b/packages/kokkos/bin/nvcc_wrapper
@@ -96,7 +96,7 @@ replace_pragma_ident=0
 first_xcompiler_arg=1
 
 # Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop)
-if [[ ! -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
+if [[ -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
   temp_dir=${TMPDIR:-/tmp}
 else
  temp_dir=${NVCC_WRAPPER_TMPDIR}
@@ -226,14 +226,14 @@ do
     cuda_args="$cuda_args $1"
     ;;
   #Handle more known nvcc args
-  --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets)
+  --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler)
     cuda_args="$cuda_args $1"
     ;;
   #Handle known nvcc args that have an argument
-  -maxrregcount=*|--maxrregcount=*)
+  -maxrregcount=*|--maxrregcount=*|-time=*)
     cuda_args="$cuda_args $1"
     ;;
-  -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include)
+  -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include|-time)
     cuda_args="$cuda_args $1 $2"
     shift
     ;;
@@ -552,14 +552,14 @@ if [ $host_only -eq 1 ]; then
   $host_command
 elif [ -n "$nvcc_depfile_command" ]; then
   if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
-    echo "$nvcc_command && $nvcc_depfile_command"
+    echo "TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command"
   fi
-  $nvcc_command && $nvcc_depfile_command
+  TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command
 else
   if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
-    echo "$nvcc_command"
+    echo "TMPDIR=${temp_dir} $nvcc_command"
   fi
-  $nvcc_command
+  TMPDIR=${temp_dir} $nvcc_command
 fi
 error_code=$?
 
diff --git a/packages/kokkos/cmake/CTestConfig.cmake.in b/packages/kokkos/cmake/CTestConfig.cmake.in
deleted file mode 100644
index 1f82c0d64d15e0a4fb346cfb7227be9cd41e5f17..0000000000000000000000000000000000000000
--- a/packages/kokkos/cmake/CTestConfig.cmake.in
+++ /dev/null
@@ -1,91 +0,0 @@
-#----------------------------------------------------------------------------------------#
-#
-#   CTestConfig.cmake template for Kokkos
-#
-#----------------------------------------------------------------------------------------#
-
-#
-#   dash-board related
-#
-set(CTEST_PROJECT_NAME "Kokkos")
-set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC")
-set(CTEST_DROP_METHOD "https")
-set(CTEST_DROP_SITE "cdash.nersc.gov")
-set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}")
-set(CTEST_CDASH_VERSION "1.6")
-set(CTEST_CDASH_QUERY_VERSION TRUE)
-set(CTEST_SUBMIT_RETRY_COUNT "1")
-set(CTEST_SUBMIT_RETRY_DELAY "30")
-
-#
-#   configure/build related
-#
-set(CTEST_BUILD_NAME "@BUILD_NAME@")
-set(CTEST_MODEL "@MODEL@")
-set(CTEST_SITE "@SITE@")
-set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@")
-set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@")
-set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@")
-
-#
-#   configure/build related
-#
-set(CTEST_UPDATE_TYPE "git")
-set(CTEST_UPDATE_VERSION_ONLY ON)
-# set(CTEST_GENERATOR "")
-# set(CTEST_GENERATOR_PLATFORM "")
-
-#
-#   testing related
-#
-set(CTEST_TIMEOUT "7200")
-set(CTEST_TEST_TIMEOUT "7200")
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
-set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576")
-
-#
-#   coverage related
-#
-set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*")
-
-#
-#   commands
-#
-if(NOT "@CHECKOUT_COMMAND@" STREQUAL "")
-    set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@")
-endif()
-set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@")
-set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@")
-set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@")
-if(NOT WIN32)
-    set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@")
-endif()
-set(CTEST_COVERAGE_COMMAND "gcov")
-set(CTEST_MEMORYCHECK_COMMAND "valgrind")
-set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@")
-
-#
-#   various configs
-#
-set(APPEND_VALUE @APPEND@)
-if(APPEND_VALUE)
-    set(APPEND_CTEST APPEND)
-endif()
-
-macro(SET_TEST_PROP VAR)
-    if(NOT "${ARGS}" STREQUAL "")
-        set(${VAR}_CTEST ${VAR} ${ARGN})
-    endif()
-endmacro()
-
-set_test_prop(START           @START@)
-set_test_prop(END             @END@)
-set_test_prop(STRIDE          @STRIDE@)
-set_test_prop(INCLUDE         @INCLUDE@)
-set_test_prop(EXCLUDE         @EXCLUDE@)
-set_test_prop(INCLUDE_LABEL   @INCLUDE_LABEL@)
-set_test_prop(EXCLUDE_LABEL   @EXCLUDE_LABEL@)
-set_test_prop(PARALLEL_LEVEL  @PARALLEL_LEVEL@)
-set_test_prop(STOP_TIME       @STOP_TIME@)
-set_test_prop(COVERAGE_LABELS @LABELS@)
diff --git a/packages/kokkos/cmake/KokkosCI.cmake b/packages/kokkos/cmake/KokkosCI.cmake
deleted file mode 100644
index e8c9af37ad544a93a62f498e9a903696321a1c75..0000000000000000000000000000000000000000
--- a/packages/kokkos/cmake/KokkosCI.cmake
+++ /dev/null
@@ -1,350 +0,0 @@
-cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
-
-message(STATUS "")
-
-get_cmake_property(_cached_vars CACHE_VARIABLES)
-set(KOKKOS_CMAKE_ARGS)
-set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT"
-                       "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE")
-list(SORT _cached_vars)
-foreach(_var ${_cached_vars})
-    if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES)
-        list(APPEND KOKKOS_CMAKE_ARGS ${_var})
-        if("${_var}" STREQUAL "CMAKE_BUILD_TYPE")
-            set(BUILD_TYPE "${CMAKE_BUILD_TYPE}")
-        endif()
-    endif()
-endforeach()
-
-
-#----------------------------------------------------------------------------------------#
-#
-#   Macros and variables
-#
-#----------------------------------------------------------------------------------------#
-
-macro(CHECK_REQUIRED VAR)
-    if(NOT DEFINED ${VAR})
-        message(FATAL_ERROR "Error! Variable '${VAR}' must be defined")
-    endif()
-endmacro()
-
-# require the build name variable
-CHECK_REQUIRED(BUILD_NAME)
-
-# uses all args
-macro(SET_DEFAULT VAR)
-    if(NOT DEFINED ${VAR})
-        set(${VAR} ${ARGN})
-    endif()
-    # remove these ctest configuration variables from the defines
-    # passed to the Kokkos configuration
-    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
-        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
-    endif()
-endmacro()
-
-# uses first arg -- useful for selecting via priority from multiple
-# potentially defined variables, e.g.:
-#
-#   set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME})
-#
-macro(SET_DEFAULT_ARG1 VAR)
-    if(NOT DEFINED ${VAR})
-        foreach(_ARG ${ARGN})
-            if(NOT "${_ARG}" STREQUAL "")
-                set(${VAR} ${_ARG})
-                break()
-            endif()
-        endforeach()
-    endif()
-    # remove these ctest configuration variables from the defines
-    # passed to the Kokkos configuration
-    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
-        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
-    endif()
-endmacro()
-
-# determine the default working directory
-if(NOT "$ENV{WORKSPACE}" STREQUAL "")
-    set(WORKING_DIR "$ENV{WORKSPACE}")
-else()
-    get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
-endif()
-
-# determine the hostname
-execute_process(COMMAND hostname
-    OUTPUT_VARIABLE HOSTNAME
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}")
-
-# get the number of processors
-include(ProcessorCount)
-ProcessorCount(NUM_PROCESSORS)
-
-# find git
-find_package(Git QUIET)
-if(NOT GIT_EXECUTABLE)
-    unset(GIT_EXECUTABLE CACHE)
-    unset(GIT_EXECUTABLE)
-endif()
-
-function(EXECUTE_GIT_COMMAND VAR)
-    set(${VAR} "" PARENT_SCOPE)
-    execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN}
-        OUTPUT_VARIABLE VAL
-        RESULT_VARIABLE RET
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
-        ERROR_QUIET)
-    string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}")
-    set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE)
-    if(RET EQUAL 0)
-        set(${VAR} "${VAL}" PARENT_SCOPE)
-    endif()
-endfunction()
-
-# just gets the git branch name if available
-function(GET_GIT_BRANCH_NAME VAR)
-    execute_git_command(GIT_BRANCH branch --show-current)
-    set(_INVALID "%D" "HEAD")
-    if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
-        execute_git_command(GIT_BRANCH show -s --format=%D)
-        if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
-            execute_git_command(GIT_BRANCH --describe all)
-        endif()
-    endif()
-    #
-    if(GIT_BRANCH)
-        string(REPLACE " " ";" _DESC "${GIT_BRANCH}")
-        # just set it to last one via loop instead of wonky cmake index manip
-        foreach(_ITR ${_DESC})
-            set(GIT_BRANCH "${_ITR}")
-        endforeach()
-        set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE)
-        message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}")
-    endif()
-endfunction()
-
-# just gets the git branch name if available
-function(GET_GIT_AUTHOR_NAME VAR)
-    execute_git_command(GIT_AUTHOR show -s --format=%an)
-    if(GIT_AUTHOR)
-        string(LENGTH "${GIT_AUTHOR}" STRLEN)
-        # if the build name gets too long, this can cause submission errors
-        if(STRLEN GREATER 24)
-            # remove middle initial
-            string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}")
-            # get first and sur name
-            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}")
-            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}")
-            if(S_NAME)
-                set(GIT_AUTHOR "${S_NAME}")
-            elseif(F_NAME)
-                set(GIT_AUTHOR "${F_NAME}")
-            endif()
-        endif()
-        # remove any spaces, quotes, periods, etc.
-        string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}")
-        set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE)
-        message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}")
-    endif()
-endfunction()
-
-# get the name of the branch
-GET_GIT_BRANCH_NAME(GIT_BRANCH)
-# get the name of the author
-GET_GIT_AUTHOR_NAME(GIT_AUTHOR)
-# author, prefer git method for consistency
-SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR})
-# SLUG == owner_name/repo_name
-SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG})
-# branch name
-SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH})
-# pull request number
-SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM})
-# get the event type, e.g. push, pull_request, api, cron, etc.
-SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE})
-
-if("${BRANCH}" STREQUAL "")
-    message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'")
-    message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=<name>")
-endif()
-
-#----------------------------------------------------------------------------------------#
-#
-#   Set default values if not provided on command-line
-#
-#----------------------------------------------------------------------------------------#
-
-SET_DEFAULT(SOURCE_DIR      "${WORKING_DIR}")           # source directory
-SET_DEFAULT(BINARY_DIR      "${WORKING_DIR}/build")     # build directory
-SET_DEFAULT(BUILD_TYPE      "${CMAKE_BUILD_TYPE}")      # Release, Debug, etc.
-SET_DEFAULT(MODEL           "Continuous")               # Continuous, Nightly, or Experimental
-SET_DEFAULT(JOBS            1)                          # number of parallel ctests
-SET_DEFAULT(CTEST_COMMAND   "${CMAKE_CTEST_COMMAND}")   # just in case
-SET_DEFAULT(CTEST_ARGS      "-V --output-on-failure")   # extra arguments when ctest is called
-SET_DEFAULT(GIT_EXECUTABLE  "git")                      # ctest_update
-SET_DEFAULT(TARGET          "all")                      # build target
-SET_DEFAULT_ARG1(SITE       "$ENV{SITE}"
-                            "${HOSTNAME}")              # update site
-SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}"
-                            "${NUM_PROCESSORS}")        # number of parallel compile jobs
-#
-#   The variable below correspond to ctest arguments, i.e. START,END,STRIDE are
-#   '-I START,END,STRIDE'
-#
-SET_DEFAULT(START           "")
-SET_DEFAULT(END             "")
-SET_DEFAULT(STRIDE          "")
-SET_DEFAULT(INCLUDE         "")
-SET_DEFAULT(EXCLUDE         "")
-SET_DEFAULT(INCLUDE_LABEL   "")
-SET_DEFAULT(EXCLUDE_LABEL   "")
-SET_DEFAULT(PARALLEL_LEVEL  "")
-SET_DEFAULT(STOP_TIME       "")
-SET_DEFAULT(LABELS          "")
-SET_DEFAULT(NOTES           "")
-
-# default static build tag for Nightly
-set(BUILD_TAG "${BRANCH}")
-
-if(NOT BUILD_TYPE)
-    # default for kokkos if not specified
-    set(BUILD_TYPE "RelWithDebInfo")
-endif()
-
-# generate dynamic name if continuous or experimental model
-if(NOT "${MODEL}" STREQUAL "Nightly")
-    if(EVENT_TYPE AND PULL_REQUEST_NUM)
-        # e.g. pull_request/123
-        if(AUTHOR)
-            set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}")
-        else()
-            set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}")
-        endif()
-    elseif(SLUG)
-        # e.g. owner_name/repo_name
-        set(BUILD_TAG "${SLUG}")
-    elseif(AUTHOR)
-        set(BUILD_TAG "${AUTHOR}/${BRANCH}")
-    endif()
-    if(EVENT_TYPE AND NOT PULL_REQUEST_NUM)
-        set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}")
-    endif()
-endif()
-
-# unnecessary
-string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}")
-string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}")
-
-message(STATUS "BUILD_TAG: ${BUILD_TAG}")
-
-set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]")
-
-# colons in build name create extra (empty) entries in CDash
-string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}")
-# unnecessary info
-string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}")
-# consistency
-string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}")
-string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}")
-# miscellaneous from missing fields
-string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}")
-string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}")
-
-# check binary directory
-if(EXISTS ${BINARY_DIR})
-    if(NOT IS_DIRECTORY "${BINARY_DIR}")
-        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!")
-    endif()
-    file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*")
-    if(NOT "${BINARY_DIR_FILES}" STREQUAL "")
-        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not empty!")
-    endif()
-endif()
-
-get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH)
-get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH)
-
-#----------------------------------------------------------------------------------------#
-#
-#   Generate the CTestConfig.cmake
-#
-#----------------------------------------------------------------------------------------#
-
-set(CONFIG_ARGS)
-foreach(_ARG ${KOKKOS_CMAKE_ARGS})
-    if(NOT "${${_ARG}}" STREQUAL "")
-        get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE)
-        if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED")
-            if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF")
-                set(_ARG_TYPE "BOOL")
-            elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}")
-                set(_ARG_TYPE "FILEPATH")
-            elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}")
-                set(_ARG_TYPE "PATH")
-            elseif(NOT "${${_ARG}}" STREQUAL "")
-                set(_ARG_TYPE "STRING")
-            endif()
-        endif()
-        set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n")
-    endif()
-endforeach()
-
-file(WRITE ${BINARY_REALDIR}/initial-cache.cmake
-"
-set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\")
-${CONFIG_ARGS}
-")
-
-file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO)
-message(STATUS "Initial cache:\n${_CACHE_INFO}")
-
-# initialize the cache
-set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake")
-
-
-# generate the CTestConfig.cmake
-configure_file(
-    ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in
-    ${BINARY_REALDIR}/CTestConfig.cmake
-    @ONLY)
-
-# copy/generate the dashboard script
-configure_file(
-    ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in
-    ${BINARY_REALDIR}/KokkosCTest.cmake
-    @ONLY)
-
-# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake
-execute_process(
-    COMMAND             ${CMAKE_COMMAND} -E touch CTestCustom.cmake
-    WORKING_DIRECTORY   ${BINARY_REALDIR}
-    )
-
-#----------------------------------------------------------------------------------------#
-#
-#   Execute CTest
-#
-#----------------------------------------------------------------------------------------#
-
-message(STATUS "")
-message(STATUS "BUILD_NAME: ${BUILD_NAME}")
-message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...")
-message(STATUS "")
-
-# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV"
-string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}")
-
-execute_process(
-    COMMAND             ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}
-    RESULT_VARIABLE     RET
-    WORKING_DIRECTORY   ${BINARY_REALDIR}
-    )
-
-# ensure that any non-zero result variable gets propagated
-if(NOT RET EQUAL 0)
-    message(FATAL_ERROR "CTest return non-zero exit code: ${RET}")
-endif()
diff --git a/packages/kokkos/cmake/KokkosCTest.cmake.in b/packages/kokkos/cmake/KokkosCTest.cmake.in
deleted file mode 100644
index b6917f3cc1897aa6b1f0876560bb08c0c87b4c3a..0000000000000000000000000000000000000000
--- a/packages/kokkos/cmake/KokkosCTest.cmake.in
+++ /dev/null
@@ -1,261 +0,0 @@
-cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
-
-if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
-    include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
-endif()
-
-include(ProcessorCount)
-ProcessorCount(CTEST_PROCESSOR_COUNT)
-
-cmake_policy(SET CMP0009 NEW)
-cmake_policy(SET CMP0011 NEW)
-
-# ---------------------------------------------------------------------------- #
-# -- Commands
-# ---------------------------------------------------------------------------- #
-find_program(CTEST_CMAKE_COMMAND    NAMES cmake)
-find_program(CTEST_UNAME_COMMAND    NAMES uname)
-
-find_program(CTEST_BZR_COMMAND      NAMES bzr)
-find_program(CTEST_CVS_COMMAND      NAMES cvs)
-find_program(CTEST_GIT_COMMAND      NAMES git)
-find_program(CTEST_HG_COMMAND       NAMES hg)
-find_program(CTEST_P4_COMMAND       NAMES p4)
-find_program(CTEST_SVN_COMMAND      NAMES svn)
-
-find_program(VALGRIND_COMMAND       NAMES valgrind)
-find_program(GCOV_COMMAND           NAMES gcov)
-find_program(LCOV_COMMAND           NAMES llvm-cov)
-find_program(MEMORYCHECK_COMMAND    NAMES valgrind )
-
-set(MEMORYCHECK_TYPE Valgrind)
-# set(MEMORYCHECK_TYPE Purify)
-# set(MEMORYCHECK_TYPE BoundsChecker)
-# set(MEMORYCHECK_TYPE ThreadSanitizer)
-# set(MEMORYCHECK_TYPE AddressSanitizer)
-# set(MEMORYCHECK_TYPE LeakSanitizer)
-# set(MEMORYCHECK_TYPE MemorySanitizer)
-# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer)
-set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full")
-
-# ---------------------------------------------------------------------------- #
-# -- Settings
-# ---------------------------------------------------------------------------- #
-## -- Process timeout in seconds
-set(CTEST_TIMEOUT           "7200")
-## -- Set output to English
-set(ENV{LC_MESSAGES}        "en_EN" )
-
-
-# ---------------------------------------------------------------------------- #
-# -- Copy ctest configuration file
-# ---------------------------------------------------------------------------- #
-macro(COPY_CTEST_CONFIG_FILES)
-
-    foreach(_FILE CTestConfig.cmake CTestCustom.cmake)
-
-        # if current directory is not binary or source directory
-        if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND
-           NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
-
-            # if file exists in current directory
-            if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE})
-                configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE}
-                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
-            endif()
-
-        # if source and binary differ
-        elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
-
-            # if file exists in source directory but not in binary directory
-            if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND
-               NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE})
-                configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE}
-                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
-            endif()
-
-        endif()
-    endforeach()
-
-endmacro()
-
-ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}")
-
-message(STATUS "CTEST_MODEL: ${CTEST_MODEL}")
-
-#-------------------------------------------------------------------------#
-# Start
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...")
-message(STATUS "")
-
-ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST}
-    ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})
-
-
-#-------------------------------------------------------------------------#
-# Config
-#
-copy_ctest_config_files()
-ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}")
-
-
-#-------------------------------------------------------------------------#
-# Update
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...")
-message(STATUS "")
-
-ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}"
-    RETURN_VALUE up_ret)
-
-
-#-------------------------------------------------------------------------#
-# Configure
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...")
-message(STATUS "")
-
-ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}"
-    SOURCE ${CTEST_SOURCE_DIRECTORY}
-    ${APPEND_CTEST}
-    OPTIONS "${CTEST_CONFIGURE_OPTIONS}"
-    RETURN_VALUE config_ret)
-
-
-#-------------------------------------------------------------------------#
-# Echo configure log bc Damien wants to delay merging this PR for eternity
-#
-file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log")
-# should only have one but loop just for safety
-foreach(_LOG ${_configure_log})
-    file(READ ${_LOG} _LOG_MESSAGE)
-    message(STATUS "Configure Log: ${_LOG}")
-    message(STATUS "\n${_LOG_MESSAGE}\n")
-endforeach()
-
-
-#-------------------------------------------------------------------------#
-# Build
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...")
-message(STATUS "")
-
-ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}"
-    ${APPEND_CTEST}
-    RETURN_VALUE build_ret)
-
-
-#-------------------------------------------------------------------------#
-# Echo build log bc Damien wants to delay merging this PR for eternity
-#
-file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log")
-# should only have one but loop just for safety
-foreach(_LOG ${_build_log})
-    file(READ ${_LOG} _LOG_MESSAGE)
-    message(STATUS "Build Log: ${_LOG}")
-    message(STATUS "\n${_LOG_MESSAGE}\n")
-endforeach()
-
-
-#-------------------------------------------------------------------------#
-# Test
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...")
-message(STATUS "")
-
-ctest_test(RETURN_VALUE test_ret
-    ${APPEND_CTEST}
-    ${START_CTEST}
-    ${END_CTEST}
-    ${STRIDE_CTEST}
-    ${INCLUDE_CTEST}
-    ${EXCLUDE_CTEST}
-    ${INCLUDE_LABEL_CTEST}
-    ${EXCLUDE_LABEL_CTEST}
-    ${PARALLEL_LEVEL_CTEST}
-    ${STOP_TIME_CTEST}
-    SCHEDULE_RANDOM OFF)
-
-
-#-------------------------------------------------------------------------#
-# Coverage
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...")
-message(STATUS "")
-
-execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS}
-    WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY}
-    ERROR_QUIET)
-
-ctest_coverage(${APPEND_CTEST}
-    ${CTEST_COVERAGE_LABELS}
-    RETURN_VALUE cov_ret)
-
-
-#-------------------------------------------------------------------------#
-# MemCheck
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...")
-message(STATUS "")
-
-ctest_memcheck(RETURN_VALUE mem_ret
-    ${APPEND_CTEST}
-    ${START_CTEST}
-    ${END_CTEST}
-    ${STRIDE_CTEST}
-    ${INCLUDE_CTEST}
-    ${EXCLUDE_CTEST}
-    ${INCLUDE_LABEL_CTEST}
-    ${EXCLUDE_LABEL_CTEST}
-    ${PARALLEL_LEVEL_CTEST})
-
-
-#-------------------------------------------------------------------------#
-# Submit
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...")
-message(STATUS "")
-
-file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake")
-foreach(_FILE ${NOTE_FILES})
-    message(STATUS "Including CTest notes files: \"${_FILE}\"...")
-    include("${_FILE}")
-endforeach()
-
-# capture submit error so it doesn't fail because of a submission error
-ctest_submit(RETURN_VALUE submit_ret
-    RETRY_COUNT 2
-    RETRY_DELAY 10
-    CAPTURE_CMAKE_ERROR submit_err)
-
-#-------------------------------------------------------------------------#
-# Submit
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})")
-message(STATUS "")
-
-
-#-------------------------------------------------------------------------#
-# Non-zero exit codes for important errors
-#
-if(NOT config_ret EQUAL 0)
-    message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}")
-endif()
-
-if(NOT build_ret EQUAL 0)
-    message(FATAL_ERROR "Error during build! Exit code: ${build_ret}")
-endif()
-
-if(NOT test_ret EQUAL 0)
-    message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}")
-endif()
diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in
index 3455b0cb42e78c7e17286c70edd9f19274b8dcfb..07baa0a5f09a708d344a9ef72510baa6f4b8e15b 100644
--- a/packages/kokkos/cmake/KokkosCore_config.h.in
+++ b/packages/kokkos/cmake/KokkosCore_config.h.in
@@ -41,6 +41,7 @@
 #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 #cmakedefine KOKKOS_ENABLE_DEBUG
@@ -49,17 +50,21 @@
 #cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS
 #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
 #cmakedefine KOKKOS_ENABLE_TUNING
-#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE
+#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3
+#cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS
 #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
 #cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
-#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION  // deprecated
+#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
 
 /* TPL Settings */
 #cmakedefine KOKKOS_ENABLE_HWLOC
 #cmakedefine KOKKOS_USE_LIBRT
 #cmakedefine KOKKOS_ENABLE_HBWSPACE
 #cmakedefine KOKKOS_ENABLE_LIBDL
+#cmakedefine KOKKOS_ENABLE_LIBQUADMATH
 #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
 
 #cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@
@@ -79,6 +84,12 @@
 #cmakedefine KOKKOS_ARCH_POWER8
 #cmakedefine KOKKOS_ARCH_POWER9
 #cmakedefine KOKKOS_ARCH_INTEL_GEN
+#cmakedefine KOKKOS_ARCH_INTEL_DG1
+#cmakedefine KOKKOS_ARCH_INTEL_GEN9
+#cmakedefine KOKKOS_ARCH_INTEL_GEN11
+#cmakedefine KOKKOS_ARCH_INTEL_GEN12LP
+#cmakedefine KOKKOS_ARCH_INTEL_XEHP
+#cmakedefine KOKKOS_ARCH_INTEL_GPU
 #cmakedefine KOKKOS_ARCH_KEPLER
 #cmakedefine KOKKOS_ARCH_KEPLER30
 #cmakedefine KOKKOS_ARCH_KEPLER32
@@ -95,6 +106,7 @@
 #cmakedefine KOKKOS_ARCH_VOLTA70
 #cmakedefine KOKKOS_ARCH_VOLTA72
 #cmakedefine KOKKOS_ARCH_TURING75
+#cmakedefine KOKKOS_ARCH_AMPERE
 #cmakedefine KOKKOS_ARCH_AMPERE80
 #cmakedefine KOKKOS_ARCH_AMPERE86
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
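The deprecation pair added above is typically consumed together in headers: KOKKOS_ENABLE_DEPRECATED_CODE_3 controls whether release-3 APIs are compiled at all, while KOKKOS_ENABLE_DEPRECATION_WARNINGS controls whether using them warns. A minimal sketch of that pattern; the macro and function names here are illustrative, not taken from this patch:

    // Sketch only; 'EXAMPLE_DEPRECATED' and 'legacy_function' are hypothetical.
    #if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS)
    #define EXAMPLE_DEPRECATED [[deprecated("goes away after major release 3")]]
    #else
    #define EXAMPLE_DEPRECATED
    #endif

    #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
    EXAMPLE_DEPRECATED void legacy_function();  // still available, possibly warning
    #endif
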
diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
index 8d58d96415808499dc39d44ad3600f5f5a64368e..0c825c59e04248f2cd76d5faf9c6aa16a663bbb1 100644
--- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
@@ -29,7 +29,12 @@ ELSE()
 ENDIF()
 
 include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER)
+IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC)
+  SET(KOKKOS_CUDA_ERROR "Using NVHPC as the host compiler requires at least CMake 3.20.1")
+ELSE()
+  SET(KOKKOS_CUDA_ERROR DEFAULT_MSG)
+ENDIF()
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER)
 IF (FOUND_CUDA_DRIVER AND FOUND_CUDART)
   KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
     LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart
diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..be70b711e0f92ac9d99e3c3fdd2770430f6c2b68
--- /dev/null
+++ b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake
@@ -0,0 +1 @@
+KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath)
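This one-line module connects to the -pedantic removal in kokkos_arch.cmake below: libquadmath leans on the GNU 'Q' literal suffix, which -Wpedantic flags as non-standard. A hedged sketch of the kind of code the TPL enables, assuming GCC with libquadmath installed (link with -lquadmath):

    // Minimal sketch of quadmath.h usage; not part of this patch.
    #include <quadmath.h>
    #include <cstdio>

    int main() {
      __float128 third = 1.0Q / 3.0Q;  // the 'Q' suffix is what -Wpedantic rejects
      char buf[128];
      quadmath_snprintf(buf, sizeof buf, "%.30Qg", third);
      std::puts(buf);  // prints ~30 significant digits
    }
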
diff --git a/packages/kokkos/cmake/deps/quadmath.cmake b/packages/kokkos/cmake/deps/quadmath.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..826f5021d359e99c2aed8b695de7b601dabaf453
--- /dev/null
+++ b/packages/kokkos/cmake/deps/quadmath.cmake
@@ -0,0 +1,46 @@
+# @HEADER
+# ************************************************************************
+#
+#                        Kokkos v. 3.0
+#       Copyright (2020) National Technology & Engineering
+#               Solutions of Sandia, LLC (NTESS).
+#
+# Under the terms of Contract DE-NA0003525 with NTESS,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+#
+# ************************************************************************
+# @HEADER
+
+KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
+  REQUIRED_HEADERS quadmath.h
+  REQUIRED_LIBS_NAMES quadmath
+)
diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake
index e8b85542c633eaea6f63c32ada79c7d7b2402794..c4637339f31fa47e80d9b53a1435b3ddc7641573 100644
--- a/packages/kokkos/cmake/kokkos_arch.cmake
+++ b/packages/kokkos/cmake/kokkos_arch.cmake
@@ -67,8 +67,13 @@ KOKKOS_ARCH_OPTION(ZEN3            HOST "AMD Zen3 architecture")
 KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900")
 KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906")
 KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU MI100 GFX908")
+KOKKOS_ARCH_OPTION(VEGA90A         GPU  "AMD GPU MI200 GFX90A")
 KOKKOS_ARCH_OPTION(INTEL_GEN       GPU  "Intel GPUs Gen9+")
-
+KOKKOS_ARCH_OPTION(INTEL_DG1       GPU  "Intel Iris XeMAX GPU")
+KOKKOS_ARCH_OPTION(INTEL_GEN9      GPU  "Intel GPU Gen9")
+KOKKOS_ARCH_OPTION(INTEL_GEN11     GPU  "Intel GPU Gen11")
+KOKKOS_ARCH_OPTION(INTEL_GEN12LP   GPU  "Intel GPU Gen12LP")
+KOKKOS_ARCH_OPTION(INTEL_XEHP      GPU  "Intel GPU Xe-HP")
 
 
 IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
@@ -76,6 +81,12 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
     "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic"
     "-Wsign-compare" "-Wtype-limits" "-Wuninitialized")
 
+  # NOTE: the KOKKOS_-prefixed (all-uppercase) variable is not set yet because TPLs are processed after ARCH
+  IF(Kokkos_ENABLE_LIBQUADMATH)
+    # warning: non-standard suffix on floating constant [-Wpedantic]
+    LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic")
+  ENDIF()
+
   # OpenMPTarget compilers give erroneous warnings about sign comparison in loops
   IF(KOKKOS_ENABLE_OPENMPTARGET)
     LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare")
@@ -86,7 +97,7 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
 
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID CMAKE_CXX_COMPILER_ID
-    PGI         NO-VALUE-SPECIFIED
+    NVHPC       NO-VALUE-SPECIFIED
     GNU         ${GNU_WARNINGS}
     DEFAULT     ${COMMON_WARNINGS}
   )
@@ -158,16 +169,18 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ARMV80)
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8-a
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_ARMV81)
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8.1-a
   )
 ENDIF()
@@ -175,8 +188,9 @@ ENDIF()
 IF (KOKKOS_ARCH_ARMV8_THUNDERX)
   SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8-a -mtune=thunderx
   )
 ENDIF()
@@ -184,23 +198,28 @@ ENDIF()
 IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
   SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_A64FX)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8.2-a+sve
-    Clang -march=armv8.2-a+sve -msve-vector-bits=512
-    GCC -march=armv8.2-a+sve -msve-vector-bits=512
+    Clang   -march=armv8.2-a+sve -msve-vector-bits=512
+    GCC     -march=armv8.2-a+sve -msve-vector-bits=512
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_ZEN)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen
     DEFAULT -march=znver1 -mtune=znver1
   )
   SET(KOKKOS_ARCH_AMD_ZEN  ON)
@@ -209,7 +228,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ZEN2)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen2
     DEFAULT -march=znver2 -mtune=znver2
   )
   SET(KOKKOS_ARCH_AMD_ZEN2 ON)
@@ -218,7 +239,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ZEN3)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen2
     DEFAULT -march=znver3 -mtune=znver3
   )
   SET(KOKKOS_ARCH_AMD_ZEN3 ON)
@@ -227,8 +250,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_WSM)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xSSE4.2
-    PGI     -tp=nehalem
+    NVHPC   -tp=px
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -msse4.2
   )
@@ -238,8 +262,9 @@ ENDIF()
 IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX)
   SET(KOKKOS_ARCH_AVX ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx
-    PGI     -tp=sandybridge
+    NVHPC   -tp=sandybridge
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -mavx
   )
@@ -248,8 +273,9 @@ ENDIF()
 IF (KOKKOS_ARCH_HSW)
   SET(KOKKOS_ARCH_AVX2 ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX2
-    PGI     -tp=haswell
+    NVHPC   -tp=haswell
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=core-avx2 -mtune=core-avx2
   )
@@ -258,8 +284,9 @@ ENDIF()
 IF (KOKKOS_ARCH_BDW)
   SET(KOKKOS_ARCH_AVX2 ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX2
-    PGI     -tp=haswell
+    NVHPC   -tp=haswell
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm
   )
@@ -269,8 +296,9 @@ IF (KOKKOS_ARCH_KNL)
   #avx512-mic
   SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xMIC-AVX512
-    PGI     NO-VALUE-SPECIFIED
+    NVHPC   -tp=knl
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=knl -mtune=knl
   )
@@ -279,6 +307,7 @@ ENDIF()
 IF (KOKKOS_ARCH_KNC)
   SET(KOKKOS_USE_ISA_KNC ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     DEFAULT -mmic
   )
 ENDIF()
@@ -287,8 +316,9 @@ IF (KOKKOS_ARCH_SKX)
   #avx512-xeon
   SET(KOKKOS_ARCH_AVX512XEON ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX512
-    PGI     NO-VALUE-SPECIFIED
+    NVHPC   -tp=skylake
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
   )
@@ -304,7 +334,8 @@ ENDIF()
 
 IF (KOKKOS_ARCH_POWER7)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -mcpu=power7 -mtune=power7
   )
   SET(KOKKOS_USE_ISA_POWERPCBE ON)
@@ -312,16 +343,16 @@ ENDIF()
 
 IF (KOKKOS_ARCH_POWER8)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
-    NVIDIA  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   -tp=pwr8
     DEFAULT -mcpu=power8 -mtune=power8
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_POWER9)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
-    NVIDIA  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   -tp=pwr9
     DEFAULT -mcpu=power9 -mtune=power9
   )
 ENDIF()
@@ -368,7 +399,7 @@ ENDIF()
 
 IF (KOKKOS_ENABLE_SYCL)
   COMPILER_SPECIFIC_FLAGS(
-    DEFAULT -fsycl
+    DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int
   )
   COMPILER_SPECIFIC_OPTIONS(
     DEFAULT -fsycl-unnamed-lambda
@@ -443,20 +474,58 @@ ENDFUNCTION()
 CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25
 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60
 CHECK_AMDGPU_ARCH(VEGA908 gfx908)
+CHECK_AMDGPU_ARCH(VEGA90A gfx90a)
 
 IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED)
-  MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
-                     "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
+  IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
+    FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator)
+    EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS)
+    STRING(LENGTH "${GPU_ARCHS}" len_str)
+    # the enumerator always outputs gfx000 as the first line
+    IF(${len_str} LESS 8)
+      MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
+                         "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
+    ENDIF()
+  ELSE()
+    MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
+                       "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
+  ENDIF()
+ENDIF()
+
+MACRO(CHECK_MULTIPLE_INTEL_ARCH)
+  IF(KOKKOS_ARCH_INTEL_GPU)
+    MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!")
+  ENDIF()
+  SET(KOKKOS_ARCH_INTEL_GPU ON)
+ENDMACRO()
+
+IF(KOKKOS_ARCH_INTEL_GEN)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_DG1)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN9)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN11)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN12LP)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_XEHP)
+  CHECK_MULTIPLE_INTEL_ARCH()
 ENDIF()
 
 IF (KOKKOS_ENABLE_OPENMPTARGET)
   SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
   IF (CLANG_CUDA_ARCH)
-    STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH})
+    STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH})
     COMPILER_SPECIFIC_FLAGS(
       Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda
-      XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
-      PGI -gpu=${PGI_CUDA_ARCH}
+      XL    -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
+      NVHPC -gpu=${NVHPC_CUDA_ARCH}
     )
   ENDIF()
   SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG})
@@ -465,7 +534,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
       Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa
     )
   ENDIF()
-  IF (KOKKOS_ARCH_INTEL_GEN)
+  IF (KOKKOS_ARCH_INTEL_GPU)
     COMPILER_SPECIFIC_FLAGS(
       IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
     )
@@ -485,7 +554,27 @@ IF (KOKKOS_ENABLE_SYCL)
     ENDIF()
   ELSEIF(KOKKOS_ARCH_INTEL_GEN)
     COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl"
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
     )
   ENDIF()
 ENDIF()
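Because CHECK_MULTIPLE_INTEL_ARCH funnels every specific option into the single KOKKOS_ARCH_INTEL_GPU flag (exported through the config header earlier in this patch), downstream code can branch once on "any Intel GPU" instead of enumerating devices. A hedged sketch with a made-up tuning constant:

    // Illustrative only; 'preferred_team_size' is not a Kokkos symbol.
    #include <cstddef>

    #if defined(KOKKOS_ARCH_INTEL_GPU)
    constexpr std::size_t preferred_team_size = 256;  // covers GEN9..XEHP and DG1
    #else
    constexpr std::size_t preferred_team_size = 128;
    #endif
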
diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake
index 23847263a952ce0e94fe48c58dbdfc50b228b314..5afed4fb0e7ba0cd2bca8250b6f58e4434f483ec 100644
--- a/packages/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake
@@ -137,7 +137,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Clang      4.0.0 or higher"
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    GCC        5.3.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Intel     17.0.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    NVCC      9.2.88 or higher")
-SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC      3.8.0 or higher")
+SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC      4.2.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    PGI         17.4 or higher\n")
 
 IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
@@ -158,13 +158,23 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
   ENDIF()
   SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE)
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
-  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0)
+  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.2.0)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
   IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
+  # Treat PGI internally as NVHPC to simplify handling both compilers.
+  # Before CMake 3.20, NVHPC was identified as PGI; nvc++ is
+  # backward-compatible with pgc++.
+  SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
+ENDIF()
+
+IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID)
+  SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID})
+ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI)
+  SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
 ENDIF()
 
 STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION})
diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake
index d7f83ddbdf877b672cfc196f89d6b3f61d109087..7fd0794036454da9d8a246fd4a3a19fe2e5cf0ef 100644
--- a/packages/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake
@@ -62,7 +62,7 @@ IF(KOKKOS_ENABLE_OPENMP)
       COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
       Clang      -Xcompiler ${ClangOpenMPFlag}
       IntelLLVM  -Xcompiler -fiopenmp
-      PGI        -Xcompiler -mp
+      NVHPC      -Xcompiler -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -Xcompiler -qsmp=omp
       DEFAULT    -Xcompiler -fopenmp
@@ -72,7 +72,7 @@ IF(KOKKOS_ENABLE_OPENMP)
       Clang      ${ClangOpenMPFlag}
       IntelLLVM  -fiopenmp
       AppleClang -Xpreprocessor -fopenmp
-      PGI        -mp
+      NVHPC      -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -qsmp=omp
       DEFAULT    -fopenmp
@@ -94,7 +94,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
     Clang      ${ClangOpenMPFlag} -Wno-openmp-mapping
     IntelLLVM  -fiopenmp -Wno-openmp-mapping
     XL         -qsmp=omp -qoffload -qnoeh
-    PGI        -mp=gpu
+    NVHPC      -mp=gpu
     DEFAULT    -fopenmp
   )
   COMPILER_SPECIFIC_DEFS(
diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake
index 95bce66c7bee32f8800cbd6e0324f9d4c599c97c..4cb8bd20f5ecb3e519ef64d9e1c31c0a5cb7e431 100644
--- a/packages/kokkos/cmake/kokkos_enable_options.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_options.cmake
@@ -26,9 +26,16 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID)
 # Put a check in just in case people are using this option
 KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE)
 
+# Set the default for desul atomics usage.
+set(_DESUL_ATOMICS_DEFAULT ON)
+
 KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for CUDA")
 KOKKOS_ENABLE_OPTION(CUDA_UVM             OFF "Whether to use unified memory (UM) for CUDA by default")
 KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC   OFF "Whether to use CUDA LDG intrinsics")
+# As of 08/12/2021, CudaMallocAsync causes issues if UCX is used as the MPI communication layer.
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC      OFF  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
+KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3    ON "Whether code deprecated in major release 3 is available")
+KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings")
 KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for HIP")
 KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH   OFF "Whether HPX supports asynchronous dispatch")
 KOKKOS_ENABLE_OPTION(TESTS         OFF  "Whether to build the unit tests")
@@ -50,6 +57,9 @@ KOKKOS_ENABLE_OPTION(TUNING               OFF "Whether to create bindings for tu
 KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
 KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER      ON  "Whether to potentially use the launch compiler")
 
+# This option will go away eventually, but for now it allows falling back to the old implementation when needed.
+KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS   ${_DESUL_ATOMICS_DEFAULT}  "Whether to use desul-based atomics - option only during beta")
+
 IF (KOKKOS_ENABLE_CUDA)
   SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")
 ENDIF()
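The new IMPL_CUDA_MALLOC_ASYNC option surfaces as KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC in the config header. A hedged sketch of the stream-ordered allocation it gates; this is illustrative, not the actual Kokkos allocator code:

    // Requires CUDA Toolkit 11.2+ for cudaMallocAsync; sketch only.
    #include <cuda_runtime.h>
    #include <cstddef>

    void* allocate_device_bytes(std::size_t n, cudaStream_t stream) {
      void* ptr = nullptr;
    #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
      cudaMallocAsync(&ptr, n, stream);  // stream-ordered; see the UCX caveat above
    #else
      cudaMalloc(&ptr, n);               // classic, device-synchronizing path
    #endif
      return ptr;
    }
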
diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake
index e1a3e5f8bd00802f465390f332138bbadd4f1a33..02c9a911b1b827994b7d4a1e0c004cfb55afd749 100644
--- a/packages/kokkos/cmake/kokkos_functions.cmake
+++ b/packages/kokkos/cmake/kokkos_functions.cmake
@@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET)
 ENDFUNCTION()
 
 FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
-  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
+  SET(COMPILERS NVIDIA NVHPC XL XLClang DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
   CMAKE_PARSE_ARGUMENTS(
     PARSE
     "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"
diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
index 707fb000af528694780d6668f160a3fee3472a69..1eb0592c7f054185e566f053faa931029f92fbc1 100644
--- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
+++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
@@ -140,7 +140,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE)
   IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray)
     INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake)
     kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
-  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
+  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
     INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake)
     kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
   ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel)
diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake
index d8d044c9d75384a1d8d312a94708623c735d121f..51bad521c4878c00b6b8c7587d7233c26a1d4ba9 100644
--- a/packages/kokkos/cmake/kokkos_tpls.cmake
+++ b/packages/kokkos/cmake/kokkos_tpls.cmake
@@ -67,6 +67,12 @@ SET(PTHREAD_DEFAULT OFF)
 ENDIF()
 KOKKOS_TPL_OPTION(PTHREAD ${PTHREAD_DEFAULT} TRIBITS Pthread)
 
+IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath)
+  SET(LIBQUADMATH_DEFAULT ON)
+ELSE()
+  SET(LIBQUADMATH_DEFAULT OFF)
+ENDIF()
+KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath)
 
 #Make sure we use our local FindKokkosCuda.cmake
 KOKKOS_IMPORT_TPL(HPX INTERFACE)
@@ -78,6 +84,7 @@ KOKKOS_IMPORT_TPL(LIBDL)
 KOKKOS_IMPORT_TPL(MEMKIND)
 KOKKOS_IMPORT_TPL(PTHREAD INTERFACE)
 KOKKOS_IMPORT_TPL(ROCM INTERFACE)
+KOKKOS_IMPORT_TPL(LIBQUADMATH)
 
 #Convert list to newlines (which CMake doesn't always like in cache variables)
 STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}")
diff --git a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1f7587da808fd587f6380079f9a672f124b3a25b
--- /dev/null
+++ b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake
@@ -0,0 +1,46 @@
+# @HEADER
+# ************************************************************************
+#
+#                        Kokkos v. 3.0
+#       Copyright (2020) National Technology & Engineering
+#               Solutions of Sandia, LLC (NTESS).
+#
+# Under the terms of Contract DE-NA0003525 with NTESS,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+#
+# ************************************************************************
+# @HEADER
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
+  REQUIRED_HEADERS quadmath.h
+  REQUIRED_LIBS_NAMES quadmath
+)
diff --git a/packages/kokkos/containers/performance_tests/TestDynRankView.hpp b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
index 8c507c76621d09b134ad94f12da589e8c31a014c..7ed9a0271a51db453ff29a982e57cd17d70a642d 100644
--- a/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
+++ b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
@@ -48,7 +48,7 @@
 #include <Kokkos_DynRankView.hpp>
 #include <vector>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 // Compare performance of DynRankView to View, specific focus on the parenthesis
 // operators
diff --git a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
index 65de551b2715f1eb31f4385fa0cb2a455bca6a4f..16b74a4997e5f1e643e095e167253829d47a050a 100644
--- a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
+++ b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -48,7 +48,7 @@
 #include <vector>
 #include <algorithm>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 // This test will simulate global ids
 
diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
index 0f3ba103efc5d09d012e3cc35cbfa41fa8be9170..8a23f59d32cdd4f6290465ad41fa70d521e39bfb 100644
--- a/packages/kokkos/containers/performance_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
@@ -46,7 +46,7 @@
 #define KOKKOS_TEST_SCATTER_VIEW_HPP
 
 #include <Kokkos_ScatterView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace Perf {
 
diff --git a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
index c31412552ad696ada0dad4fd1058f76290282256..4547d5c35758e2eadc0e5029779f0d2e23fc4081 100644
--- a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
+++ b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -43,7 +43,7 @@
 #ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
 #define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 #include <iostream>
 #include <iomanip>
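The four performance tests above only swap the private impl/Kokkos_Timer.hpp include for the public Kokkos_Timer.hpp; the interface they use is unchanged. For reference, a hedged sketch of that Timer usage:

    #include <Kokkos_Timer.hpp>  // public header replacing impl/Kokkos_Timer.hpp

    double time_workload() {
      Kokkos::Timer timer;              // starts timing on construction
      // ... workload under test ...
      const double elapsed = timer.seconds();
      timer.reset();                    // ready for the next measurement
      return elapsed;
    }
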
diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
index ea1d6dde5d26e9baf719281a0d8f13bb80ec59f8..c5b66f05a3ce0b7778fdcbc8e7a3e766301273d0 100644
--- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -76,20 +76,25 @@ class Bitset {
   using execution_space = Device;
   using size_type       = unsigned int;
 
-  enum { BIT_SCAN_REVERSE = 1u };
-  enum { MOVE_HINT_BACKWARD = 2u };
-
-  enum {
-    BIT_SCAN_FORWARD_MOVE_HINT_FORWARD  = 0u,
-    BIT_SCAN_REVERSE_MOVE_HINT_FORWARD  = BIT_SCAN_REVERSE,
-    BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD,
-    BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
-  };
+  static constexpr unsigned BIT_SCAN_REVERSE   = 1u;
+  static constexpr unsigned MOVE_HINT_BACKWARD = 2u;
+
+  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u;
+  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_FORWARD =
+      BIT_SCAN_REVERSE;
+  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD =
+      MOVE_HINT_BACKWARD;
+  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD =
+      BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD;
 
  private:
-  enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
-  enum { block_mask = block_size - 1u };
-  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
+  enum : unsigned {
+    block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT)
+  };
+  enum : unsigned { block_mask = block_size - 1u };
+  enum : unsigned {
+    block_shift = Kokkos::Impl::integral_power_of_two(block_size)
+  };
 
  public:
   /// constructor
@@ -317,14 +322,18 @@ class ConstBitset {
   enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
 
  public:
+  KOKKOS_FUNCTION
   ConstBitset() : m_size(0) {}
 
+  KOKKOS_FUNCTION
   ConstBitset(Bitset<Device> const& rhs)
       : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
 
+  KOKKOS_FUNCTION
   ConstBitset(ConstBitset<Device> const& rhs)
       : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
 
+  KOKKOS_FUNCTION
   ConstBitset<Device>& operator=(Bitset<Device> const& rhs) {
     this->m_size   = rhs.m_size;
     this->m_blocks = rhs.m_blocks;
@@ -332,6 +341,7 @@ class ConstBitset {
     return *this;
   }
 
+  KOKKOS_FUNCTION
   ConstBitset<Device>& operator=(ConstBitset<Device> const& rhs) {
     this->m_size   = rhs.m_size;
     this->m_blocks = rhs.m_blocks;
diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp
index 45710d1f737ca14348dd79d698bbc4a581225bbb..f55d0f2b7f3f10b43ea4ee076dc4dea191010449 100644
--- a/packages/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp
@@ -597,8 +597,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
     if (std::is_same<typename t_host::memory_space,
                      typename t_dev::memory_space>::value) {
-      typename t_dev::execution_space().fence();
-      typename t_host::execution_space().fence();
+      typename t_dev::execution_space().fence(
+          "Kokkos::DualView<>::sync: fence after syncing DualView");
+      typename t_host::execution_space().fence(
+          "Kokkos::DualView<>::sync: fence after syncing DualView");
     }
   }
 
@@ -776,10 +778,11 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   /// If \c Device is the same as this DualView's device type, then
   /// mark the device's data as modified.  Otherwise, mark the host's
   /// data as modified.
-  template <class Device>
+  template <class Device, class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   void modify() {
     if (modified_flags.data() == nullptr) return;
-    if (impl_dualview_is_single_device::value) return;
     int dev = get_device_side<Device>();
 
     if (dev == 1) {  // if Device is the same as DualView's device type
@@ -811,8 +814,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 #endif
   }
 
+  template <
+      class Device, class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  void modify() {
+    return;
+  }
+
+  template <class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   inline void modify_host() {
-    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(0) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@@ -832,8 +844,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
   }
 
+  template <
+      class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  inline void modify_host() {
+    return;
+  }
+
+  template <class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   inline void modify_device() {
-    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(1) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@@ -853,6 +874,13 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
   }
 
+  template <
+      class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  inline void modify_device() {
+    return;
+  }
+
   inline void clear_sync_state() {
     if (modified_flags.data() != nullptr)
       modified_flags(1) = modified_flags(0) = 0;
@@ -875,8 +903,15 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-    ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-    h_view = create_mirror_view(d_view);
+    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+    const bool sizeMismatch =
+        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
+
+    if (sizeMismatch) {
+      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+      h_view = create_mirror_view(d_view);
+    } else
+      ::Kokkos::deep_copy(d_view, typename t_dev::value_type{});
 
     /* Reset dirty flags */
     if (modified_flags.data() == nullptr) {
@@ -897,41 +932,31 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+    const bool sizeMismatch =
+        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
+
     if (modified_flags.data() == nullptr) {
       modified_flags = t_modified_flags("DualView::modified_flags");
     }
     if (modified_flags(1) >= modified_flags(0)) {
       /* Resize on Device */
-      ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-      h_view = create_mirror_view(d_view);
-
-      /* Mark Device copy as modified */
-      modified_flags(1) = modified_flags(1) + 1;
+      if (sizeMismatch) {
+        ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        h_view = create_mirror_view(d_view);
 
+        /* Mark Device copy as modified */
+        modified_flags(1) = modified_flags(1) + 1;
+      }
     } else {
       /* Realloc on Device */
-
-      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-
-      const bool sizeMismatch =
-          (h_view.extent(0) != n0) || (h_view.extent(1) != n1) ||
-          (h_view.extent(2) != n2) || (h_view.extent(3) != n3) ||
-          (h_view.extent(4) != n4) || (h_view.extent(5) != n5) ||
-          (h_view.extent(6) != n6) || (h_view.extent(7) != n7);
-      if (sizeMismatch)
+      if (sizeMismatch) {
         ::Kokkos::resize(h_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        d_view = create_mirror_view(typename t_dev::execution_space(), h_view);
 
-      t_host temp_view = create_mirror_view(d_view);
-
-      /* Remap on Host */
-      Kokkos::deep_copy(temp_view, h_view);
-
-      h_view = temp_view;
-
-      d_view = create_mirror_view(typename t_dev::execution_space(), h_view);
-
-      /* Mark Host copy as modified */
-      modified_flags(0) = modified_flags(0) + 1;
+        /* Mark Host copy as modified */
+        modified_flags(0) = modified_flags(0) + 1;
+      }
     }
   }
 
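The modify_host/modify_device hunks above replace a runtime `if (impl_dualview_is_single_device::value) return;` with a pair of mutually exclusive overloads chosen at compile time. Below is a minimal sketch of that idiom, assuming nothing beyond C++14; `MiniDualView` and its members are hypothetical stand-ins, not Kokkos API. The defaulted `Dummy` parameter makes the `enable_if` condition dependent, which is what lets SFINAE discard exactly one overload per instantiation.

```cpp
#include <iostream>
#include <type_traits>

// Minimal sketch of the overload-pair idiom used by DualView above.
// MiniDualView is a hypothetical stand-in, not Kokkos API.
template <bool SingleDevice>
struct MiniDualView {
  using is_single_device = std::integral_constant<bool, SingleDevice>;

  template <class Dummy = MiniDualView,
            std::enable_if_t<!Dummy::is_single_device::value>* = nullptr>
  void modify() {
    std::cout << "marking one side as modified\n";
  }

  template <class Dummy = MiniDualView,
            std::enable_if_t<Dummy::is_single_device::value>* = nullptr>
  void modify() {
    // Single-device case: both views alias the same memory, nothing to do.
  }
};

int main() {
  MiniDualView<false> two_spaces;
  MiniDualView<true> one_space;
  two_spaces.modify();  // selects the flag-updating overload
  one_space.modify();   // selects the compile-time no-op
}
```

Compared with the old runtime check, the single-device specialization now compiles to an empty inline function rather than a branch the optimizer must prove dead.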
diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
index c6323fef93694de1ee39d5784141bf6991f78bd7..b673c53a4ef8e8a760c613332418ae5d600a6812 100644
--- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -1140,7 +1140,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from using Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1154,7 +1155,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1404,7 +1406,7 @@ class ViewMapping<
 
   template <class MemoryTraits>
   struct apply {
-    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
+    static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, "");
 
     using traits_type =
         Kokkos::ViewTraits<data_type, array_layout,
@@ -1574,7 +1576,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(const DynRankView<LT, LP...>& lhs,
 namespace Kokkos {
 namespace Impl {
 
-template <class OutputView, typename Enable = void>
+template <class OutputView, class Enable = void>
 struct DynRankViewFill {
   using const_value_type = typename OutputView::traits::const_value_type;
 
@@ -1693,9 +1695,11 @@ inline void deep_copy(
                    typename ViewTraits<DT, DP...>::value_type>::value,
       "deep_copy requires non-const type");
 
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(DynRankView, value_type): fence before filling view");
   Kokkos::Impl::DynRankViewFill<DynRankView<DT, DP...> >(dst, value);
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(DynRankView, value_type): fence after filling view");
 }
 
 /** \brief  Deep copy into a value in Host memory from a view.  */
@@ -1711,10 +1715,13 @@ inline void deep_copy(
 
   using src_traits       = ViewTraits<ST, SP...>;
   using src_memory_space = typename src_traits::memory_space;
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(value_type, DynRankView): fence before copying "
+      "value");
   Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(),
                                                       sizeof(ST));
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(value_type, DynRankView): fence after copying value");
 }
 
 //----------------------------------------------------------------------------
@@ -1744,14 +1751,14 @@ inline void deep_copy(
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   if ((void*)dst.data() != (void*)src.data()) {
@@ -1762,10 +1769,14 @@ inline void deep_copy(
     // memory then can byte-wise copy
     if (rank(src) == 0 && rank(dst) == 0) {
       using value_type = typename dst_type::value_type;
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-0 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), sizeof(value_type));
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-0 views");
     } else if (std::is_same<
                    typename DstType::traits::value_type,
                    typename SrcType::traits::non_const_value_type>::value &&
@@ -1787,10 +1798,14 @@ inline void deep_copy(
                dst.extent(6) == src.extent(6) &&
                dst.extent(7) == src.extent(7)) {
       const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-1 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-1 views");
     } else if (std::is_same<
                    typename DstType::traits::value_type,
                    typename SrcType::traits::non_const_value_type>::value &&
@@ -1817,29 +1832,43 @@ inline void deep_copy(
                dst.stride_6() == src.stride_6() &&
                dst.stride_7() == src.stride_7()) {
       const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-1 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-1 views");
     } else if (DstExecCanAccessSrc) {
       // Copying data between views in accessible memory spaces and either
       // non-contiguous or incompatible shape.
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "remapping views of incompatible shape");
       Kokkos::Impl::DynRankViewRemap<dst_type, src_type>(dst, src);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "remapping views of incompatible shape");
     } else if (SrcExecCanAccessDst) {
       // Copying data between views in accessible memory spaces and either
       // non-contiguous or incompatible shape.
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "remapping views of incompatible shape");
       Kokkos::Impl::DynRankViewRemap<dst_type, src_type, src_execution_space>(
           dst, src);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "remapping views of incompatible shape");
     } else {
       Kokkos::Impl::throw_runtime_exception(
           "deep_copy given views that would require a temporary allocation");
     }
   } else {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence due to same "
+        "src and dst");
   }
 }
 
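Most of the DynRankView churn above is mechanical: every bare fence gains a human-readable label so profiling and tracing tools can attribute the wait to its call site. A small usage sketch, assuming only core Kokkos; the label text is arbitrary:

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> a("a", 1000000);
    Kokkos::parallel_for(
        "fill_a", a.extent(0), KOKKOS_LAMBDA(const int i) { a(i) = i; });
    // The string overload ties this wait to a recognizable name in tools;
    // an unlabeled Kokkos::fence() shows up only as a generic global fence.
    Kokkos::fence("example: fence after filling a");
  }
  Kokkos::finalize();
}
```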
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
index cc949d4c556ab4abd982ea5334fee870c42ef305..2c764f535c585a4f545300d619b83917f327f414 100644
--- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -53,36 +53,201 @@
 namespace Kokkos {
 namespace Experimental {
 
-// Simple metafunction for choosing memory space
-// In the current implementation, if memory_space == CudaSpace,
-// use CudaUVMSpace for the chunk 'array' allocation, which
-// contains will contain pointers to chunks of memory allocated
-// in CudaSpace
 namespace Impl {
-template <class MemSpace>
-struct ChunkArraySpace {
-  using memory_space = MemSpace;
-};
 
-#ifdef KOKKOS_ENABLE_CUDA
-template <>
-struct ChunkArraySpace<Kokkos::CudaSpace> {
-  using memory_space = typename Kokkos::CudaUVMSpace;
-};
-#endif
-#ifdef KOKKOS_ENABLE_HIP
-template <>
-struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> {
-  using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace;
-};
-#endif
-#ifdef KOKKOS_ENABLE_SYCL
-template <>
-struct ChunkArraySpace<Kokkos::Experimental::SYCLDeviceUSMSpace> {
-  using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace;
+/// Utility class to manage memory for chunked arrays on the host and
+/// device. Allocates/deallocates memory on both the host and device, and
+/// provides utilities for creating mirrors and deep copying between them.
+template <typename MemorySpace, typename ValueType>
+struct ChunkedArrayManager {
+  using value_type   = ValueType;
+  using pointer_type = ValueType*;
+  using track_type   = Kokkos::Impl::SharedAllocationTracker;
+
+  ChunkedArrayManager()                           = default;
+  ChunkedArrayManager(ChunkedArrayManager const&) = default;
+  ChunkedArrayManager(ChunkedArrayManager&&)      = default;
+  ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default;
+  ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default;
+
+  template <typename Space, typename Value>
+  friend struct ChunkedArrayManager;
+
+  template <typename Space, typename Value>
+  inline ChunkedArrayManager(const ChunkedArrayManager<Space, Value>& rhs)
+      : m_valid(rhs.m_valid),
+        m_chunk_max(rhs.m_chunk_max),
+        m_chunks((ValueType**)(rhs.m_chunks)),
+        m_track(rhs.m_track),
+        m_chunk_size(rhs.m_chunk_size) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<MemorySpace, Space>::assignable,
+        "Incompatible ChunkedArrayManager copy construction");
+  }
+
+  ChunkedArrayManager(const unsigned arg_chunk_max,
+                      const unsigned arg_chunk_size)
+      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}
+
+ private:
+  struct ACCESSIBLE_TAG {};
+  struct INACCESSIBLE_TAG {};
+
+  ChunkedArrayManager(ACCESSIBLE_TAG, pointer_type* arg_chunks,
+                      const unsigned arg_chunk_max)
+      : m_valid(true), m_chunk_max(arg_chunk_max), m_chunks(arg_chunks) {}
+
+  ChunkedArrayManager(INACCESSIBLE_TAG, const unsigned arg_chunk_max,
+                      const unsigned arg_chunk_size)
+      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}
+
+ public:
+  template <typename Space, typename Enable_ = void>
+  struct IsAccessibleFrom;
+
+  template <typename Space>
+  struct IsAccessibleFrom<
+      Space, typename std::enable_if_t<Kokkos::Impl::MemorySpaceAccess<
+                 MemorySpace, Space>::accessible>> : std::true_type {};
+
+  template <typename Space>
+  struct IsAccessibleFrom<
+      Space, typename std::enable_if_t<!Kokkos::Impl::MemorySpaceAccess<
+                 MemorySpace, Space>::accessible>> : std::false_type {};
+
+  template <typename Space>
+  static ChunkedArrayManager<Space, ValueType> create_mirror(
+      ChunkedArrayManager<MemorySpace, ValueType> const& other,
+      typename std::enable_if<IsAccessibleFrom<Space>::value>::type* =
+          nullptr) {
+    return ChunkedArrayManager<Space, ValueType>{
+        ACCESSIBLE_TAG{}, other.m_chunks, other.m_chunk_max};
+  }
+
+  template <typename Space>
+  static ChunkedArrayManager<Space, ValueType> create_mirror(
+      ChunkedArrayManager<MemorySpace, ValueType> const& other,
+      typename std::enable_if<!IsAccessibleFrom<Space>::value>::type* =
+          nullptr) {
+    using tag_type =
+        typename ChunkedArrayManager<Space, ValueType>::INACCESSIBLE_TAG;
+    return ChunkedArrayManager<Space, ValueType>{tag_type{}, other.m_chunk_max,
+                                                 other.m_chunk_size};
+  }
+
+ public:
+  void allocate_device(const std::string& label) {
+    if (m_chunks == nullptr) {
+      m_chunks = reinterpret_cast<pointer_type*>(MemorySpace().allocate(
+          label.c_str(), (sizeof(pointer_type) * (m_chunk_max + 2))));
+    }
+  }
+
+  void initialize() {
+    for (unsigned i = 0; i < m_chunk_max + 2; i++) {
+      m_chunks[i] = nullptr;
+    }
+    m_valid = true;
+  }
+
+ private:
+  /// Custom destroy functor for deallocating array chunks along with a linked
+  /// allocation
+  template <typename Space>
+  struct Destroy {
+    Destroy()               = default;
+    Destroy(Destroy&&)      = default;
+    Destroy(const Destroy&) = default;
+    Destroy& operator=(Destroy&&) = default;
+    Destroy& operator=(const Destroy&) = default;
+
+    Destroy(std::string label, value_type** arg_chunk,
+            const unsigned arg_chunk_max, const unsigned arg_chunk_size,
+            value_type** arg_linked)
+        : m_label(label),
+          m_chunks(arg_chunk),
+          m_linked(arg_linked),
+          m_chunk_max(arg_chunk_max),
+          m_chunk_size(arg_chunk_size) {}
+
+    void execute() {
+      // Destroy the array of chunk pointers.
+      // Two entries beyond the max chunks are allocation counters.
+      for (unsigned i = 0; i < m_chunk_max; i++) {
+        Space().deallocate(m_label.c_str(), m_chunks[i],
+                           sizeof(value_type) * m_chunk_size);
+      }
+      // Destroy the linked allocation if we have one.
+      if (m_linked != nullptr) {
+        Space().deallocate(m_label.c_str(), m_linked,
+                           (sizeof(value_type*) * (m_chunk_max + 2)));
+      }
+    }
+
+    void destroy_shared_allocation() { execute(); }
+
+    std::string m_label;
+    value_type** m_chunks = nullptr;
+    value_type** m_linked = nullptr;
+    unsigned m_chunk_max;
+    unsigned m_chunk_size;
+  };
+
+ public:
+  template <typename Space>
+  void allocate_with_destroy(const std::string& label,
+                             pointer_type* linked_allocation = nullptr) {
+    using destroy_type = Destroy<Space>;
+    using record_type =
+        Kokkos::Impl::SharedAllocationRecord<MemorySpace, destroy_type>;
+
+    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
+    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent. This must
+    // match Destroy's execute(...) method.
+    record_type* const record = record_type::allocate(
+        MemorySpace(), label, (sizeof(pointer_type) * (m_chunk_max + 2)));
+    m_chunks = static_cast<pointer_type*>(record->data());
+    m_track.assign_allocated_record_to_uninitialized(record);
+
+    record->m_destroy = destroy_type(label, m_chunks, m_chunk_max, m_chunk_size,
+                                     linked_allocation);
+  }
+
+  pointer_type* get_ptr() const { return m_chunks; }
+
+  template <typename Space>
+  typename std::enable_if<!IsAccessibleFrom<Space>::value>::type deep_copy_to(
+      ChunkedArrayManager<Space, ValueType> const& other) {
+    Kokkos::Impl::DeepCopy<Space, MemorySpace>(
+        other.m_chunks, m_chunks, sizeof(pointer_type) * (m_chunk_max + 2));
+  }
+
+  template <typename Space>
+  typename std::enable_if<IsAccessibleFrom<Space>::value>::type deep_copy_to(
+      ChunkedArrayManager<Space, ValueType> const&) {
+    // no-op
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type* operator+(int i) const { return m_chunks + i; }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type& operator[](int i) const { return m_chunks[i]; }
+
+  track_type const& track() const { return m_track; }
+
+  KOKKOS_INLINE_FUNCTION
+  bool valid() const { return m_valid; }
+
+ private:
+  bool m_valid           = false;
+  unsigned m_chunk_max   = 0;
+  pointer_type* m_chunks = nullptr;
+  track_type m_track;
+  unsigned m_chunk_size = 0;
 };
-#endif
-}  // end namespace Impl
+
+} /* end namespace Impl */
 
 /** \brief Dynamic views are restricted to rank-one and no layout.
  *         Resize only occurs on host outside of parallel_regions.
@@ -93,6 +258,13 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  public:
   using traits = Kokkos::ViewTraits<DataType, P...>;
 
+  using value_type   = typename traits::value_type;
+  using device_space = typename traits::memory_space;
+  using host_space =
+      typename Kokkos::Impl::HostMirror<device_space>::Space::memory_space;
+  using device_accessor = Impl::ChunkedArrayManager<device_space, value_type>;
+  using host_accessor   = Impl::ChunkedArrayManager<host_space, value_type>;
+
  private:
   template <class, class...>
   friend class DynamicView;
@@ -108,7 +280,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                 "DynamicView only implemented for non-specialized View type");
 
   template <class Space, bool = Kokkos::Impl::MemorySpaceAccess<
-                             Space, typename traits::memory_space>::accessible>
+                             Space, device_space>::accessible>
   struct verify_space {
     KOKKOS_FORCEINLINE_FUNCTION static void check() {}
   };
@@ -123,9 +295,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   };
 
  private:
-  track_type m_track;
-  typename traits::value_type** m_chunks =
-      nullptr;             // array of pointers to 'chunks' of memory
+  device_accessor m_chunks;
+  host_accessor m_chunks_host;
   unsigned m_chunk_shift;  // ceil(log2(m_chunk_size))
   unsigned m_chunk_mask;   // m_chunk_size - 1
   unsigned m_chunk_max;  // number of entries in the chunk array - each pointing
@@ -173,7 +344,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 
   KOKKOS_INLINE_FUNCTION
   size_t allocation_extent() const noexcept {
-    uintptr_t n = *reinterpret_cast<const uintptr_t*>(m_chunks + m_chunk_max);
+    uintptr_t n =
+        *reinterpret_cast<const uintptr_t*>(m_chunks_host + m_chunk_max);
     return (n << m_chunk_shift);
   }
 
@@ -183,7 +355,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   KOKKOS_INLINE_FUNCTION
   size_t size() const noexcept {
     size_t extent_0 =
-        *reinterpret_cast<const size_t*>(m_chunks + m_chunk_max + 1);
+        *reinterpret_cast<const size_t*>(m_chunks_host + m_chunk_max + 1);
     return extent_0;
   }
 
@@ -215,10 +387,10 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   // Allocation tracking properties
 
   KOKKOS_INLINE_FUNCTION
-  int use_count() const { return m_track.use_count(); }
+  int use_count() const { return m_chunks_host.track().use_count(); }
 
   inline const std::string label() const {
-    return m_track.template get_label<typename traits::memory_space>();
+    return m_chunks_host.track().template get_label<host_space>();
   }
 
   //----------------------------------------------------------------------
@@ -285,13 +457,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    *          up to the maximum number of chunks
    * */
   template <typename IntType>
-  inline typename std::enable_if<
-      std::is_integral<IntType>::value &&
-      Kokkos::Impl::MemorySpaceAccess<
-          Kokkos::HostSpace,
-          typename Impl::ChunkArraySpace<
-              typename traits::memory_space>::memory_space>::accessible>::type
-  resize_serial(IntType const& n) {
+  inline void resize_serial(IntType const& n) {
     using local_value_type   = typename traits::value_type;
     using value_pointer_type = local_value_type*;
 
@@ -304,37 +470,40 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
     }
 
     // *m_chunks[m_chunk_max] stores the current number of chunks being used
-    uintptr_t* const pc = reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
-    std::string _label =
-        m_track.template get_label<typename traits::memory_space>();
+    uintptr_t* const pc =
+        reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
+    std::string _label = m_chunks_host.track().template get_label<host_space>();
+
     if (*pc < NC) {
       while (*pc < NC) {
-        m_chunks[*pc] = reinterpret_cast<value_pointer_type>(
-            typename traits::memory_space().allocate(
+        m_chunks_host[*pc] =
+            reinterpret_cast<value_pointer_type>(device_space().allocate(
                 _label.c_str(), sizeof(local_value_type) << m_chunk_shift));
         ++*pc;
       }
     } else {
       while (NC + 1 <= *pc) {
         --*pc;
-        typename traits::memory_space().deallocate(
-            _label.c_str(), m_chunks[*pc],
-            sizeof(local_value_type) << m_chunk_shift);
-        m_chunks[*pc] = nullptr;
+        device_space().deallocate(_label.c_str(), m_chunks_host[*pc],
+                                  sizeof(local_value_type) << m_chunk_shift);
+        m_chunks_host[*pc] = nullptr;
       }
     }
-    // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize
+    // *m_chunks_host[m_chunk_max+1] stores the 'extent' requested by resize
     *(pc + 1) = n;
+
+    m_chunks_host.deep_copy_to(m_chunks);
   }
 
   KOKKOS_INLINE_FUNCTION bool is_allocated() const {
-    if (m_chunks == nullptr) {
-      return false;
-    } else {
-      // *m_chunks[m_chunk_max] stores the current number of chunks being used
+    if (m_chunks_host.valid()) {
+      // *m_chunks_host[m_chunk_max] stores the current number of chunks being
+      // used
       uintptr_t* const pc =
-          reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
+          reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
       return (*(pc + 1) > 0);
+    } else {
+      return false;
     }
   }
 
@@ -349,8 +518,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 
   template <class RT, class... RP>
   DynamicView(const DynamicView<RT, RP...>& rhs)
-      : m_track(rhs.m_track),
-        m_chunks((typename traits::value_type**)rhs.m_chunks),
+      : m_chunks(rhs.m_chunks),
+        m_chunks_host(rhs.m_chunks_host),
         m_chunk_shift(rhs.m_chunk_shift),
         m_chunk_mask(rhs.m_chunk_mask),
         m_chunk_max(rhs.m_chunk_max),
@@ -361,63 +530,6 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                   "Incompatible DynamicView copy construction");
   }
 
-  //----------------------------------------------------------------------
-
-  struct Destroy {
-    using local_value_type = typename traits::value_type;
-    std::string m_label;
-    local_value_type** m_chunks;
-    unsigned m_chunk_max;
-    bool m_destroy;
-    unsigned m_chunk_size;
-
-    // Initialize or destroy array of chunk pointers.
-    // Two entries beyond the max chunks are allocation counters.
-    inline void operator()(unsigned i) const {
-      if (m_destroy && i < m_chunk_max && nullptr != m_chunks[i]) {
-        typename traits::memory_space().deallocate(
-            m_label.c_str(), m_chunks[i],
-            sizeof(local_value_type) * m_chunk_size);
-      }
-      m_chunks[i] = nullptr;
-    }
-
-    void execute(bool arg_destroy) {
-      using Range = Kokkos::RangePolicy<typename HostSpace::execution_space>;
-
-      m_destroy = arg_destroy;
-
-      Kokkos::Impl::ParallelFor<Destroy, Range> closure(
-          *this,
-          Range(0, m_chunk_max + 2));  // Add 2 to 'destroy' extra slots storing
-                                       // num_chunks and extent; previously + 1
-
-      closure.execute();
-
-      typename traits::execution_space().fence();
-      // Impl::ChunkArraySpace< typename traits::memory_space
-      // >::memory_space::execution_space().fence();
-    }
-
-    void construct_shared_allocation() { execute(false); }
-
-    void destroy_shared_allocation() { execute(true); }
-
-    Destroy()               = default;
-    Destroy(Destroy&&)      = default;
-    Destroy(const Destroy&) = default;
-    Destroy& operator=(Destroy&&) = default;
-    Destroy& operator=(const Destroy&) = default;
-
-    Destroy(std::string label, typename traits::value_type** arg_chunk,
-            const unsigned arg_chunk_max, const unsigned arg_chunk_size)
-        : m_label(label),
-          m_chunks(arg_chunk),
-          m_chunk_max(arg_chunk_max),
-          m_destroy(false),
-          m_chunk_size(arg_chunk_size) {}
-  };
-
   /**\brief  Allocation constructor
    *
    *  Memory is allocated in chunks
@@ -427,10 +539,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   explicit inline DynamicView(const std::string& arg_label,
                               const unsigned min_chunk_size,
                               const unsigned max_extent)
-      : m_track(),
-        m_chunks(nullptr)
-        // The chunk size is guaranteed to be a power of two
-        ,
+      :  // The chunk size is guaranteed to be a power of two
         m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains(
             min_chunk_size))  // div ceil(log2(min_chunk_size))
         ,
@@ -440,28 +549,22 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                     m_chunk_shift)  // max num pointers-to-chunks in array
         ,
         m_chunk_size(2 << (m_chunk_shift - 1)) {
-    using chunk_array_memory_space = typename Impl::ChunkArraySpace<
-        typename traits::memory_space>::memory_space;
-    // A functor to deallocate all of the chunks upon final destruction
-    using record_type =
-        Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space, Destroy>;
-
-    // Allocate chunk pointers and allocation counter
-    record_type* const record =
-        record_type::allocate(chunk_array_memory_space(), arg_label,
-                              (sizeof(pointer_type) * (m_chunk_max + 2)));
-    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
-    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in
-    // Destroy's execute(...) method
-
-    m_chunks = reinterpret_cast<pointer_type*>(record->data());
-
-    record->m_destroy = Destroy(arg_label, m_chunks, m_chunk_max, m_chunk_size);
+    m_chunks = device_accessor(m_chunk_max, m_chunk_size);
 
-    // Initialize to zero
-    record->m_destroy.construct_shared_allocation();
-
-    m_track.assign_allocated_record_to_uninitialized(record);
+    if (device_accessor::template IsAccessibleFrom<host_space>::value) {
+      m_chunks.template allocate_with_destroy<device_space>(arg_label);
+      m_chunks.initialize();
+      m_chunks_host =
+          device_accessor::template create_mirror<host_space>(m_chunks);
+    } else {
+      m_chunks.allocate_device(arg_label);
+      m_chunks_host =
+          device_accessor::template create_mirror<host_space>(m_chunks);
+      m_chunks_host.template allocate_with_destroy<device_space>(
+          arg_label, m_chunks.get_ptr());
+      m_chunks_host.initialize();
+      m_chunks_host.deep_copy_to(m_chunks);
+    }
   }
 };
 
@@ -487,8 +590,8 @@ inline void deep_copy(const View<T, DP...>& dst,
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   if (DstExecCanAccessSrc) {
@@ -512,8 +615,8 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   if (DstExecCanAccessSrc) {
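The heart of the DynamicView rewrite is `ChunkedArrayManager::create_mirror`, which branches on `MemorySpaceAccess::accessible`: an accessible space receives an aliasing mirror of the chunk-pointer array, while an inaccessible one gets its own allocation that `deep_copy_to` keeps in sync. A stripped-down sketch of that dispatch, with a plain `bool` standing in for the accessibility trait and a hypothetical `MiniManager` type:

```cpp
#include <iostream>
#include <type_traits>

// Hypothetical MiniManager sketch (C++14): Accessible plays the role of
// Kokkos::Impl::MemorySpaceAccess<...>::accessible in the patch above.
template <bool Accessible>
struct MiniManager {
  int* chunks = nullptr;

  // Accessible space: the "mirror" aliases the existing pointer array.
  template <bool A = Accessible, std::enable_if_t<A>* = nullptr>
  static MiniManager create_mirror(const MiniManager& other) {
    return MiniManager{other.chunks};
  }

  // Inaccessible space: allocate separately; a deep copy keeps it in sync.
  template <bool A = Accessible, std::enable_if_t<!A>* = nullptr>
  static MiniManager create_mirror(const MiniManager&) {
    return MiniManager{new int[8]{}};
  }
};

int main() {
  MiniManager<true> shared{new int[8]{}};
  auto alias = MiniManager<true>::create_mirror(shared);
  std::cout << (alias.chunks == shared.chunks) << '\n';  // prints 1: aliased

  MiniManager<false> device{new int[8]{}};
  auto mirror = MiniManager<false>::create_mirror(device);
  std::cout << (mirror.chunks == device.chunks) << '\n';  // prints 0: distinct

  delete[] shared.chunks;
  delete[] device.chunks;
  delete[] mirror.chunks;
}
```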
diff --git a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
index fbfaed9b1bcda2d22077947532f3abe303ea5533..18f026dc6ffcffc6c0b1884358ebf5a85012d40e 100644
--- a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
@@ -187,7 +187,8 @@ template <typename ReportType, typename DeviceType>
 void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size) {
   m_reports.resize(new_size);
   m_reporters.resize(new_size);
-  typename DeviceType::execution_space().fence();
+  typename DeviceType::execution_space().fence(
+      "Kokkos::Experimental::ErrorReporter::resize: fence after resizing");
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
index 0f21a08ba3ba86ed176dc4c4535ef76c960e90bc..57bf745d4038de73b71654e518aa855e0faa1698 100644
--- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -116,8 +116,7 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds(
       This check should cover the case of Views that don't
       have the Unmanaged trait but were initialized by pointer. */
     if (tracker.has_record()) {
-      Kokkos::Impl::operator_bounds_error_on_device<MapType>(
-          map, Kokkos::Impl::has_printable_label_typedef<MapType>());
+      Kokkos::Impl::operator_bounds_error_on_device(map);
     } else {
       Kokkos::abort("OffsetView bounds error");
     }
@@ -1244,7 +1243,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from using Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::OffsetView::OffsetView(): fence before UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1256,7 +1256,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::OffsetView::OffsetView(): fence after UVM allocation");
     }
 #endif
     //------------------------------------------------------------
diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
index dcd4cf73e5d710bc427772a8a8de6384e80c9dae..79bc43b7393d85a1214e0ca3a8dc15861281e44e 100644
--- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -834,7 +834,7 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
     static_assert(std::is_same<typename dest_type::array_layout, Layout>::value,
                   "ScatterView contribute destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView contribute destination memory space not accessible");
     if (dest.data() == internal_view.data()) return;
@@ -1061,7 +1061,7 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                                Kokkos::LayoutRight>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     bool is_equal = (dest.data() == internal_view.data());
@@ -1290,7 +1290,7 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
                                Kokkos::LayoutLeft>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     auto extent   = internal_view.extent(internal_view_type::rank - 1);
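`SpaceAccessibility` is now spelled without the `Impl` namespace throughout these static_asserts; it has become a public trait. A short sketch of the public spelling:

```cpp
#include <Kokkos_Core.hpp>
#include <iostream>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Public trait: can the default execution space dereference HostSpace?
    constexpr bool ok = Kokkos::SpaceAccessibility<
        Kokkos::DefaultExecutionSpace, Kokkos::HostSpace>::accessible;
    std::cout << "default exec space can access HostSpace: " << ok << '\n';
  }
  Kokkos::finalize();
}
```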
diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
index 81be3ee2d3e836436a23f8808a07f9386bc3ac05..cd633e40310177b116f04220c7030545ba37039d 100644
--- a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -405,7 +405,9 @@ class StaticCrsGraph {
     Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning",
                          Kokkos::RangePolicy<execution_space>(0, numRows()),
                          partitioner);
-    typename device_type::execution_space().fence();
+    typename device_type::execution_space().fence(
+        "Kokkos::StaticCrsGraph::create_block_partitioning:: fence after "
+        "partition");
 
     row_block_offsets = block_offsets;
   }
diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index edb0e7261da93bb629cad4e9cc7c7d3118868288..a1601eee35869f5c26249dbf2ed325c4e84d5420 100644
--- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -345,7 +345,8 @@ class UnorderedMap {
       const impl_value_type tmp = impl_value_type();
       Kokkos::deep_copy(m_values, tmp);
     }
-    { Kokkos::deep_copy(m_scalars, 0); }
+    Kokkos::deep_copy(m_scalars, 0);
+    m_size = 0;
   }
 
   KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
@@ -393,9 +394,9 @@ class UnorderedMap {
   ///
   /// This method has undefined behavior when erasable() is true.
   ///
-  /// Note that this is not a device function; it cannot be called in
+  /// Note that this is <i>not</i> a device function; it cannot be called in
   /// a parallel kernel.  The value is not stored as a variable; it
-  /// must be computed.
+  /// must be computed. m_size is a mutable cache of that value.
   size_type size() const {
     if (capacity() == 0u) return 0u;
     if (modified()) {
@@ -419,9 +420,13 @@ class UnorderedMap {
   bool begin_erase() {
     bool result = !erasable();
     if (is_insertable_map && result) {
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::begin_erase: fence before setting erasable "
+          "flag");
       set_flag(erasable_idx);
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::begin_erase: fence after setting erasable "
+          "flag");
     }
     return result;
   }
@@ -429,10 +434,12 @@ class UnorderedMap {
   bool end_erase() {
     bool result = erasable();
     if (is_insertable_map && result) {
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::end_erase: fence before erasing");
       Impl::UnorderedMapErase<declared_map_type> f(*this);
       f.apply();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::end_erase: fence after erasing");
       reset_flag(erasable_idx);
     }
     return result;
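The added `m_size = 0;` in `clear()` matters because `size()` is a lazily recomputed, host-side count cached in `m_size`: zeroing the scalars alone leaves the stale cache behind, which is exactly what the new `UnorderedMap_clear_zero_size` test further below pins down. A minimal sketch of the intended semantics, mirroring that test on the host backend:

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <cassert>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // void value type makes this a set-style map, as in the unit test.
    Kokkos::UnorderedMap<int, void, Kokkos::DefaultHostExecutionSpace> set(16);
    set.insert(2);
    set.insert(3);
    assert(set.size() == 2u);  // size() scans and caches the count
    set.clear();               // with the fix above, also resets the cache
    assert(set.size() == 0u);
  }
  Kokkos::finalize();
}
```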
diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp
index a1fbba6b21c76b4bb7b2a63a4e3a863241a7cd74..88721bd89eb2fd86543c480727876a58fd888a56 100644
--- a/packages/kokkos/containers/src/Kokkos_Vector.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp
@@ -119,12 +119,14 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
     if (DV::template need_sync<typename DV::t_dev::device_type>()) {
       set_functor_host f(DV::h_view, val);
       parallel_for("Kokkos::vector::assign", n, f);
-      typename DV::t_host::execution_space().fence();
+      typename DV::t_host::execution_space().fence(
+          "Kokkos::vector::assign: fence after assigning values");
       DV::template modify<typename DV::t_host::device_type>();
     } else {
       set_functor f(DV::d_view, val);
       parallel_for("Kokkos::vector::assign", n, f);
-      typename DV::t_dev::execution_space().fence();
+      typename DV::t_dev::execution_space().fence(
+          "Kokkos::vector::assign: fence after assigning values");
       DV::template modify<typename DV::t_dev::device_type>();
     }
   }
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
index 6047e60f3dd080b8cfe456627ccc80266e7df66b..9512f2d4a20e509af321d315c8963693076a0d58 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -57,22 +57,10 @@
 namespace Kokkos {
 namespace Impl {
 
-KOKKOS_FORCEINLINE_FUNCTION
-unsigned rotate_left(unsigned i, int r) {
-  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
-  return r ? ((i << r) | (i >> (size - r))) : i;
-}
-
 KOKKOS_FORCEINLINE_FUNCTION
 unsigned rotate_right(unsigned i, int r) {
   constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
-  // FIXME_SYCL llvm.fshr.i32 missing
-  // (https://github.com/intel/llvm/issues/3308)
-#ifdef __SYCL_DEVICE_ONLY__
-  return rotate_left(i, size - r);
-#else
   return r ? ((i >> r) | (i << (size - r))) : i;
-#endif
 }
 
 template <typename Bitset>
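With the SYCL detour gone, `rotate_right` is the classic shift-or rotation kept above (C++20's `std::rotr` computes the same thing). A standalone reference showing why the ternary guard exists: shifting a 32-bit value by the full word width is undefined behavior, so `r == 0` must short-circuit:

```cpp
#include <cassert>
#include <climits>

// Plain rotate-right: bits shifted off the low end re-enter at the top.
// The r == 0 case returns i directly, avoiding the undefined full-width
// shift (i << 32 for a 32-bit unsigned).
unsigned rotate_right(unsigned i, int r) {
  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
  return r ? ((i >> r) | (i << (size - r))) : i;
}

int main() {
  assert(rotate_right(0x1u, 1) == 0x80000000u);  // assumes 32-bit unsigned
  assert(rotate_right(0xABCD1234u, 0) == 0xABCD1234u);
}
```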
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
index 367ab338572064f167c3c50f447e4d27efff6999..fdd78e4e5f99dc4748c093d274c1e62f9316261a 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@@ -75,7 +75,7 @@ uint32_t fmix32(uint32_t h) {
 
 KOKKOS_INLINE_FUNCTION
 uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) {
-  const uint8_t* data = (const uint8_t*)key;
+  const uint8_t* data = static_cast<const uint8_t*>(key);
   const int nblocks   = len / 4;
 
   uint32_t h1 = seed;
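The MurmurHash hunk swaps a C-style cast for `static_cast`, which admits only the intended `const void*` to `const uint8_t*` conversion and refuses anything stronger, such as casting away const. A small contrast, using a hypothetical `first_byte` helper:

```cpp
#include <cstdint>
#include <cstdio>

// static_cast documents and enforces the exact conversion intended.
std::uint8_t first_byte(const void* key) {
  const std::uint8_t* data = static_cast<const std::uint8_t*>(key);
  // std::uint8_t* bad = (std::uint8_t*)key;  // C-style also drops const!
  return data[0];
}

int main() {
  const std::uint32_t v = 0x11223344u;
  std::printf("%u\n", first_byte(&v));  // printed byte depends on endianness
}
```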
diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp
index 3eee85ed10bd81bc8b511afa9f0fbde7ba244b8f..e22564aa5c24e569ae98d972fe5526a35cc741a6 100644
--- a/packages/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp
@@ -49,7 +49,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_DualView.hpp>
 
 namespace Test {
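This and the matching test-header hunks below only swap `<impl/Kokkos_Timer.hpp>` for the public `<Kokkos_Timer.hpp>`; the `Kokkos::Timer` interface itself is unchanged. A short usage sketch with the public header:

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>  // public header; the impl/ path is internal
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::Timer timer;
    Kokkos::View<double*> v("v", 1 << 20);
    Kokkos::deep_copy(v, 1.0);
    Kokkos::fence("example: fence before reading the timer");
    std::printf("elapsed: %g s\n", timer.seconds());
  }
  Kokkos::finalize();
}
```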
diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
index dd0199ed81c75dcee42b964ac0bb1c246175ed01..a8d62bd24cad46531f2b4814f4d832c08758fe10 100644
--- a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -702,6 +702,11 @@ class TestDynViewAPI {
 
   using View0 = Kokkos::View<T, device>;
   using View1 = Kokkos::View<T*, device>;
+  using View2 = Kokkos::View<T**, device>;
+  using View3 = Kokkos::View<T***, device>;
+  using View4 = Kokkos::View<T****, device>;
+  using View5 = Kokkos::View<T*****, device>;
+  using View6 = Kokkos::View<T******, device>;
   using View7 = Kokkos::View<T*******, device>;
 
   using host_view_space = typename View0::host_mirror_space;
@@ -1065,7 +1070,7 @@ class TestDynViewAPI {
 
     dView0 d_uninitialized(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20);
-    ASSERT_TRUE(d_uninitialized.data() != nullptr);
+    ASSERT_NE(d_uninitialized.data(), nullptr);
     ASSERT_EQ(d_uninitialized.rank(), 2);
     ASSERT_EQ(d_uninitialized.extent(0), 10);
     ASSERT_EQ(d_uninitialized.extent(1), 20);
@@ -1075,14 +1080,14 @@ class TestDynViewAPI {
     hView0 hx, hy, hz;
 
     ASSERT_TRUE(Kokkos::is_dyn_rank_view<dView0>::value);
-    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double> >::value);
-
-    ASSERT_TRUE(dx.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dy.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dz.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(hx.data() == nullptr);
-    ASSERT_TRUE(hy.data() == nullptr);
-    ASSERT_TRUE(hz.data() == nullptr);
+    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double>>::value);
+
+    ASSERT_EQ(dx.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dy.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dz.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(hx.data(), nullptr);
+    ASSERT_EQ(hy.data(), nullptr);
+    ASSERT_EQ(hz.data(), nullptr);
     ASSERT_EQ(dx.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dy.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dz.extent(0), 0u);  // Okay with UVM
@@ -1153,11 +1158,11 @@ class TestDynViewAPI {
 
     ASSERT_EQ(dx.use_count(), size_t(2));
 
-    ASSERT_FALSE(dx.data() == nullptr);
-    ASSERT_FALSE(const_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
+    ASSERT_NE(dx.data(), nullptr);
+    ASSERT_NE(const_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
     ASSERT_NE(dx, dy);
 
     ASSERT_EQ(dx.extent(0), unsigned(N0));
@@ -1317,17 +1322,17 @@ class TestDynViewAPI {
     ASSERT_NE(dx, dz);
 
     dx = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
     dy = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
     dz = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
 
     // View - DynRankView Interoperability tests
     // deep_copy from view to dynrankview
@@ -1367,7 +1372,7 @@ class TestDynViewAPI {
   static void check_auto_conversion_to_const(
       const Kokkos::DynRankView<const DataType, device>& arg_const,
       const Kokkos::DynRankView<DataType, device>& arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_allocated() {
@@ -1396,8 +1401,8 @@ class TestDynViewAPI {
     const_typeX xc = x;
     const_typeR xr = x;
 
-    ASSERT_TRUE(xc == x);
-    ASSERT_TRUE(x == xc);
+    ASSERT_EQ(xc, x);
+    ASSERT_EQ(x, xc);
 
     // For CUDA the constant random access View does not return
     // an lvalue reference due to retrieving through texture cache
@@ -1406,7 +1411,7 @@ class TestDynViewAPI {
     if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value)
 #endif
     {
-      ASSERT_TRUE(x.data() == xr.data());
+      ASSERT_EQ(x.data(), xr.data());
     }
 
     // typeX xf = xc ; // setting non-const from const must not compile
@@ -1659,29 +1664,29 @@ class TestDynViewAPI {
     const_svector_right_type cvr3 =
         Kokkos::subdynrankview(mv, Kokkos::ALL(), 2);
 
-    ASSERT_TRUE(&v1[0] == &v1(0));
-    ASSERT_TRUE(&v1[0] == &mv(0, 0));
-    ASSERT_TRUE(&v2[0] == &mv(0, 1));
-    ASSERT_TRUE(&v3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cv1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cv2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cv3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&vr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&vr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&vr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cvr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cvr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cvr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2));
-    ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3));
-    ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4));
-    ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2));
-    ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3));
-    ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4));
+    ASSERT_EQ(&v1[0], &v1(0));
+    ASSERT_EQ(&v1[0], &mv(0, 0));
+    ASSERT_EQ(&v2[0], &mv(0, 1));
+    ASSERT_EQ(&v3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cv1[0], &mv(0, 0));
+    ASSERT_EQ(&cv2[0], &mv(0, 1));
+    ASSERT_EQ(&cv3[0], &mv(0, 2));
+
+    ASSERT_EQ(&vr1[0], &mv(0, 0));
+    ASSERT_EQ(&vr2[0], &mv(0, 1));
+    ASSERT_EQ(&vr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cvr1[0], &mv(0, 0));
+    ASSERT_EQ(&cvr2[0], &mv(0, 1));
+    ASSERT_EQ(&cvr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&mv1(0, 0), &mv(1, 2));
+    ASSERT_EQ(&mv1(1, 1), &mv(2, 3));
+    ASSERT_EQ(&mv1(3, 2), &mv(4, 4));
+    ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2));
+    ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3));
+    ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4));
 
     const_svector_type c_cv1(v1);
     typename svector_type::const_type c_cv2(v2);
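The long run of assertion rewrites above changes diagnostics, not behavior: `ASSERT_EQ`/`ASSERT_NE` print both operand values when they fire, whereas `ASSERT_TRUE(a == b)` can only report that the condition was false. A two-assertion illustration:

```cpp
#include <gtest/gtest.h>

TEST(AssertStyle, PrintsOperands) {
  int* p = nullptr;
  ASSERT_EQ(p, nullptr);      // on failure, prints the actual pointer value
  ASSERT_TRUE(p == nullptr);  // on failure, says only that this was false
}
```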
diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
index f018793dd6f3b162acbf9db20174c47ac75fc1c0..023bf92f62b48bc46878209e6c5ef6eccedeb726 100644
--- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -52,7 +52,7 @@
 #include <Kokkos_Core.hpp>
 
 #include <Kokkos_DynamicView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
index 9ddc226e291f6e7dc7d6bc960fad470fafeb9974..24a43e1ebc72820dbd84dd6e2931837cabfaecba 100644
--- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
@@ -50,7 +50,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_OffsetView.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
 
diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
index fdbce2d492009cf38d5491398d77423108edc6a5..342ce2af48afe2cba3737db653f67957d04a51d4 100644
--- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
@@ -118,11 +118,51 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       scatter_access(k, 3)++;
       scatter_access(k, 4)--;
       scatter_access(k, 5) -= 5;
+// Workaround for an Intel 17 compiler bug which sometimes adds random
+// instruction alignment that makes the lock instruction illegal.
+// It seems to affect mostly unsigned int atomics. Looking at the
+// assembly, the compiler appears to insert cache line alignment for
+// the instruction. The issue is not restricted to specific archs; we
+// have seen it on SNB and SKX, but for different code. Another
+// occurrence was with desul atomics in a different unit test, while
+// this one here happens without desul atomics. Inserting an assembly
+// nop instruction changes the alignment and works around the
+// problem.
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 6) += 2;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 7)++;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 8)--;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       --scatter_access_atomic(k, 9);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       ++scatter_access_atomic(k, 10);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access(k, 11) -= 3;
     }
   }
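The same five-line guard is repeated around each atomic access above. Should the pattern spread, one hypothetical way to keep the test readable is a local macro (not part of this patch; the name is illustrative only) that expands to the nop only under the affected compilers:

```cpp
#include <Kokkos_Macros.hpp>  // defines KOKKOS_COMPILER_INTEL when relevant

#if defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 1800)
#define TEST_INTEL17_ALIGNMENT_NOP() asm volatile("nop\n")
#else
#define TEST_INTEL17_ALIGNMENT_NOP() ((void)0)
#endif

int main() {
  TEST_INTEL17_ALIGNMENT_NOP();  // no-op everywhere except Intel < 18.0
}
```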
diff --git a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
index a9a178f95e7b7fedabcb7b00b292d88603ff3f77..c9a3eed90c372fcd4211d0a46868fe8bcc061614 100644
--- a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
+++ b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@@ -180,8 +180,6 @@ void run_test_graph3(size_t B, size_t N) {
 
   std::vector<size_t> sizes(LENGTH);
 
-  size_t total_length = 0;
-
   for (size_t i = 0; i < LENGTH; ++i) {
     sizes[i] = rand() % 1000;
   }
@@ -189,10 +187,6 @@ void run_test_graph3(size_t B, size_t N) {
   sizes[1]    = N;
   sizes[1998] = N;
 
-  for (size_t i = 0; i < LENGTH; ++i) {
-    total_length += sizes[i];
-  }
-
   int C    = 0;
   dView dx = Kokkos::create_staticcrsgraph<dView>("test", sizes);
   dx.create_block_partitioning(B, C);
diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
index 4413cfbc80e31271d1e2b830976796ade24aaa9a..8009b996566322147bcd5cfe257dd858b72819bb 100644
--- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
+++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -295,10 +295,8 @@ void test_deep_copy(uint32_t num_nodes) {
 }
 
 // FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs
-// FIXME_HIP
 // WORKAROUND MSVC
-#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \
-    !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
+#if !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
 TEST(TEST_CATEGORY, UnorderedMap_insert) {
   for (int i = 0; i < 500; ++i) {
     test_insert<TEST_EXECSPACE>(100000, 90000, 100, true);
@@ -329,6 +327,23 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
   ASSERT_TRUE(n.is_allocated());
 }
 
+TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) {
+  using Map =
+      Kokkos::UnorderedMap<int, void, Kokkos::DefaultHostExecutionSpace>;
+
+  Map m(11);
+  ASSERT_EQ(0u, m.size());
+
+  m.insert(2);
+  m.insert(3);
+  m.insert(5);
+  m.insert(7);
+  ASSERT_EQ(4u, m.size());
+
+  m.clear();
+  ASSERT_EQ(0u, m.size());
+}
+
 }  // namespace Test
 
 #endif  // KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/packages/kokkos/core/cmake/KokkosCore_config.h.in b/packages/kokkos/core/cmake/KokkosCore_config.h.in
deleted file mode 100644
index f0835772b864faf0126796c75f5f1e9d02f95e28..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/cmake/KokkosCore_config.h.in
+++ /dev/null
@@ -1,104 +0,0 @@
-/* The trivial 'src/build_common.sh' creates a config
- * that must stay in sync with this file.
- */
-#cmakedefine KOKKOS_FOR_SIERRA
-
-#if !defined(KOKKOS_FOR_SIERRA)
-
-#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
-#error \
-    "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
-#else
-#define KOKKOS_CORE_CONFIG_H
-#endif
-
-#cmakedefine KOKKOS_ENABLE_CUDA
-#cmakedefine KOKKOS_ENABLE_HIP
-#cmakedefine KOKKOS_ENABLE_OPENMP
-#cmakedefine KOKKOS_ENABLE_THREADS
-#cmakedefine KOKKOS_ENABLE_SERIAL
-#cmakedefine KOKKOS_ENABLE_Winthread
-
-#cmakedefine KOKKOS_ENABLE_HWLOC
-#cmakedefine KOKKOS_ENABLE_HBWSPACE
-#cmakedefine KOKKOS_ENABLE_LIBRT
-
-#cmakedefine KOKKOS_ENABLE_DEBUG
-#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
-#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
-#cmakedefine KOKKOS_ENABLE_TUNING
-
-#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
-
-#ifdef KOKKOS_ENABLE_CUDA
-
-#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
-
-// mfh 16 Sep 2014: If passed in on the command line, that overrides
-// any value of KOKKOS_USE_CUDA_UVM here.  Doing this should prevent build
-// warnings like this one:
-//
-// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning:
-// "KOKKOS_USE_CUDA_UVM" redefined
-//
-// At some point, we should edit the test-build scripts in
-// Trilinos/cmake/ctest/drivers/perseus/, and take
-// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there.  I
-// hesitate to do that now, because I'm not sure if all the files are
-// including KokkosCore_config.h (or a header file that includes it) like
-// they should.
-#ifndef KOKKOS_USE_CUDA_UVM
-#cmakedefine KOKKOS_USE_CUDA_UVM
-#endif
-
-#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-
-#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
-
-#endif
-
-#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
-
-#ifndef __CUDA_ARCH__
-#cmakedefine KOKKOS_ENABLE_ISA_X86_64
-#cmakedefine KOKKOS_ENABLE_ISA_KNC
-#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE
-#endif
-
-#ifdef KOKKOS_ENABLE_HIP
-#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-#endif
-
-#cmakedefine KOKKOS_ARCH_ARMV80 1
-#cmakedefine KOKKOS_ARCH_ARMV81 1
-#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1
-#cmakedefine KOKKOS_ARCH_AVX 1
-#cmakedefine KOKKOS_ARCH_AVX2 1
-#cmakedefine KOKKOS_ARCH_AVX512MIC 1
-#cmakedefine KOKKOS_ARCH_AVX512XEON 1
-#cmakedefine KOKKOS_ARCH_KNC 1
-#cmakedefine KOKKOS_ARCH_POWER8 1
-#cmakedefine KOKKOS_ARCH_POWER9 1
-#cmakedefine KOKKOS_ARCH_KEPLER 1
-#cmakedefine KOKKOS_ARCH_KEPLER30 1
-#cmakedefine KOKKOS_ARCH_KEPLER32 1
-#cmakedefine KOKKOS_ARCH_KEPLER35 1
-#cmakedefine KOKKOS_ARCH_KEPLER37 1
-#cmakedefine KOKKOS_ARCH_MAXWELL 1
-#cmakedefine KOKKOS_ARCH_MAXWELL50 1
-#cmakedefine KOKKOS_ARCH_MAXWELL52 1
-#cmakedefine KOKKOS_ARCH_MAXWELL53 1
-#cmakedefine KOKKOS_ARCH_PASCAL 1
-#cmakedefine KOKKOS_ARCH_PASCAL60 1
-#cmakedefine KOKKOS_ARCH_PASCAL61 1
-#cmakedefine KOKKOS_ARCH_VOLTA70 1
-
-// TODO: These are currently not used in Kokkos.  Should they be removed?
-#cmakedefine KOKKOS_ENABLE_MPI
-#cmakedefine KOKKOS_ENABLE_CUSPARSE
-
-// TODO: No longer options in Kokkos.  Need to be removed.
-#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
-
-#endif  // !defined(KOKKOS_FOR_SIERRA)
diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt
index 9ff4b6006da8cb0358f2a9e53810b79ce59e8b02..a7c57a94346d74db97c8c320e0f3669bb2cc68cc 100644
--- a/packages/kokkos/core/perf_test/CMakeLists.txt
+++ b/packages/kokkos/core/perf_test/CMakeLists.txt
@@ -10,9 +10,7 @@
 #INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
 
 # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
-IF (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI
-         OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   RETURN()
 ENDIF()
 
diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
index dee21fd7a575bd5aa0f6838980c670510f475cab..b534c32c52c691f4c65c5442d89a9381516391f1 100644
--- a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -231,7 +231,7 @@ void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << ", " << avg_seconds << std::endl;
   }
 }
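The extra column added above appends the mean next to the minimum. A minimal sketch of how a trial loop can produce both statistics (run_one_trial is a hypothetical stand-in for the benchmarked kernel; num_trials comes from the test driver):

    double min_seconds = 1.0e300;  // effectively +infinity
    double avg_seconds = 0.0;
    for (int trial = 0; trial < num_trials; ++trial) {
      Kokkos::Timer timer;
      run_one_trial();  // hypothetical: one timed kernel invocation
      const double s = timer.seconds();
      if (s < min_seconds) min_seconds = s;
      avg_seconds += s / num_trials;
    }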
 
diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
index c431c2b0c86d30192edc63d7dfbc447887f227cf..24c1898e0a16a4149c21df08e82c3da54ef0a25a 100644
--- a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
@@ -280,7 +280,7 @@ void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_hexgrad << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << ", " << avg_seconds << std::endl;
   }
 }
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
index 50bbc78a6b75815ad59ea73c0077dc27ae2dccfa..5b7c2a7a03907f8f0c854482c06ef155441c097d 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
@@ -205,7 +205,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -238,7 +238,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -280,7 +280,7 @@ TEST(default_exec, overlap_range_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
@@ -378,7 +378,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -413,7 +413,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -459,7 +459,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
@@ -548,7 +548,7 @@ TEST(default_exec, overlap_team_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -581,7 +581,7 @@ TEST(default_exec, overlap_team_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
   timer.reset();
   Kokkos::parallel_reduce(
@@ -622,7 +622,7 @@ TEST(default_exec, overlap_team_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
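All of the assertion swaps in this file are behavior-preserving; the typed gtest macros are preferred because they report both operand values when a check fails. Roughly:

    // Failure message only says the expression was false:
    ASSERT_TRUE(time_end > 1.5 * time_overlap);
    // Failure message shows both sides, e.g. "actual: 0.41 vs 0.62":
    ASSERT_GT(time_end, 1.5 * time_overlap);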
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
index 550316bec997121a58a8b44f6df8efdced16a623..555a05ea279cd2280c510a03976dd75e8ee171f2 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
@@ -120,7 +120,8 @@ void run_allocateview_tests(int N, int R) {
   {
     Kokkos::Timer timer;
     for (int r = 0; r < R; r++) {
-      double* a_ptr = (double*)Kokkos::kokkos_malloc("A", sizeof(double) * N8);
+      double* a_ptr =
+          static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
       Kokkos::parallel_for(
           N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
       Kokkos::fence();
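Kokkos::kokkos_malloc returns void*, so the conversion must be spelled out; static_cast keeps it visible to static analysis where a C-style cast would go unflagged. A minimal allocate/use/free sketch (n is a placeholder size):

    double* a_ptr =
        static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * n));
    // ... launch kernels that read and write a_ptr ...
    Kokkos::kokkos_free(a_ptr);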
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
index afeeb643569ecd1a981132bed08944288ec3ca72..b0562f2fd12227f1e3270c332cd3f1a6c4298fad 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
@@ -47,10 +47,18 @@
 namespace Test {
 
 TEST(default_exec, ViewResize_Rank8) {
+// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI
+#ifdef KOKKOS_ENABLE_SYCL
+  printf("Resize View Performance for LayoutLeft:\n");
+  run_resizeview_tests8<Kokkos::LayoutLeft>(9, 1);
+  printf("Resize View Performance for LayoutRight:\n");
+  run_resizeview_tests8<Kokkos::LayoutRight>(9, 1);
+#else
   printf("Resize View Performance for LayoutLeft:\n");
   run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1);
   printf("Resize View Performance for LayoutRight:\n");
   run_resizeview_tests8<Kokkos::LayoutRight>(10, 1);
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/test_atomic.cpp b/packages/kokkos/core/perf_test/test_atomic.cpp
index 59820f3bdd2e83291dfd524325d3b7be6ba918ef..54824e5b39b91456a81e541e39a441d20b2879a7 100644
--- a/packages/kokkos/core/perf_test/test_atomic.cpp
+++ b/packages/kokkos/core/perf_test/test_atomic.cpp
@@ -47,7 +47,7 @@
 #include <cstdlib>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
@@ -401,7 +401,7 @@ template <class T>
 void Loop(int loop, int test, const char* type_name) {
   LoopVariant<T>(loop, test);
 
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   T res       = LoopVariant<T>(loop, test);
   double time = timer.seconds();
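The perf tests in this batch switch from the private impl/Kokkos_Timer.hpp header to the public one; the interface is unchanged. A minimal usage sketch (do_work is a placeholder workload):

    #include <Kokkos_Timer.hpp>

    Kokkos::Timer timer;              // starts timing at construction
    do_work();                        // hypothetical measured workload
    const double elapsed = timer.seconds();
    timer.reset();                    // restart the clock for the next run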
 
diff --git a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
index eec1c8eacc7779121fd54e23d1ad7e4efa80902c..4086ef58163ff4ec144d2c151241952752d3453d 100644
--- a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
+++ b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
@@ -12,13 +12,13 @@
 #include <typeinfo>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
 template <typename T>
 void test(const int length) {
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   using vector = Kokkos::View<T*, exec_space>;
 
diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp
index 9aab119774c49d99ec112c527c79364a9c02ddc6..7887d4ba55196ddbc283baa023884e2f43a48991 100644
--- a/packages/kokkos/core/perf_test/test_mempool.cpp
+++ b/packages/kokkos/core/perf_test/test_mempool.cpp
@@ -48,7 +48,7 @@
 #include <limits>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using ExecSpace   = Kokkos::DefaultExecutionSpace;
 using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
@@ -100,7 +100,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+      ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
       if (ptrs(j)) ++update;
     }
@@ -129,7 +129,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      pool.deallocate((void*)ptrs(j), size_alloc);
+      pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
     }
   }
 
@@ -153,9 +153,9 @@ struct TestFunctor {
         for (unsigned k = 0; k < repeat_inner; ++k) {
           const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-          pool.deallocate((void*)ptrs(j), size_alloc);
+          pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
 
-          ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+          ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
           if (0 == ptrs(j)) update++;
         }
@@ -266,7 +266,7 @@ int main(int argc, char* argv[]) {
     TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                         fill_stride, chunk_span, repeat_inner);
 
-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
 
     if (!functor.test_fill()) {
       Kokkos::abort("fill ");
diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp
index b2f936a955eca4a6d8a3c0eec928e01c5de66e51..49957ae9323db825ebcfa6d3c19dddb38856862d 100644
--- a/packages/kokkos/core/perf_test/test_taskdag.cpp
+++ b/packages/kokkos/core/perf_test/test_taskdag.cpp
@@ -56,7 +56,7 @@ int main() { return 0; }
 #include <cstdlib>
 #include <limits>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using ExecSpace = Kokkos::DefaultExecutionSpace;
 
@@ -220,7 +220,7 @@ int main(int argc, char* argv[]) {
     double time_sum = 0;
 
     for (int i = 0; i < test_repeat_outer; ++i) {
-      Kokkos::Impl::Timer timer;
+      Kokkos::Timer timer;
 
       Functor::FutureType ftmp =
           Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input));
diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt
index 2ab0989805723ce32115d379dd39708b5edd8209..499736c60d55b7746682f8828a9af45fc6c0aa8b 100644
--- a/packages/kokkos/core/src/CMakeLists.txt
+++ b/packages/kokkos/core/src/CMakeLists.txt
@@ -9,6 +9,8 @@ INSTALL (DIRECTORY
   "${CMAKE_CURRENT_SOURCE_DIR}/"
   DESTINATION ${KOKKOS_HEADER_DIR}
   FILES_MATCHING
+  PATTERN "*.inc"
+  PATTERN "*.inc_*"
   PATTERN "*.hpp"
   PATTERN "*.h"
 )
@@ -65,6 +67,15 @@ IF (KOKKOS_ENABLE_SYCL)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp)
 ENDIF()
 
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS)
+  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/desul/src/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.inc)
+ENDIF()
+
 KOKKOS_ADD_LIBRARY(
   kokkoscore
   SOURCES ${KOKKOS_CORE_SRCS}
@@ -86,3 +97,15 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)
+
+# FIXME: We need a proper solution to figure out whether to enable
+#        libatomic
+# XL requires libatomic even for 64-bit CAS; most others only for 128-bit.
+# I (CT) removed the 128-bit CAS from desul so that libatomic would not be needed.
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS AND
+    (KOKKOS_ENABLE_OPENMPTARGET OR (CMAKE_CXX_COMPILER_ID STREQUAL XLClang)))
+  target_link_libraries(kokkoscore PUBLIC atomic)
+ENDIF()
+
+KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH)
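For context on the FIXME above: a 16-byte compare-and-swap is the usual construct that pulls in libatomic when the compiler will not inline it lock-free. A minimal reproducer of the dependency (not code from this tree):

    #include <atomic>

    struct alignas(16) Pair {
      long lo, hi;
    };

    bool cas128(std::atomic<Pair>& v, Pair expected, Pair desired) {
      // With GCC on x86-64 this becomes a libatomic call unless -mcx16
      // (or an equivalent target option) provides a lock-free 16-byte CAS.
      return v.compare_exchange_strong(expected, desired);
    }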
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 916f109758de4ba3cf469659d7458ae77cf464da..f6b276240316ec1e6edc332d680eb72853c980b1 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -90,43 +90,25 @@ static std::atomic<int> num_uvm_allocations(0);
 
 }  // namespace
 
-DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
+void DeepCopyCuda(void *dst, const void *src, size_t n) {
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
 }
 
-DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
-}
-
-DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
-}
-
-DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
-}
-
-DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
-}
-
-DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
+void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src,
+                       size_t n) {
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
       cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
 }
 
 void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
   cudaStream_t s = cuda_get_deep_copy_stream();
-  CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
-  cudaStreamSynchronize(s);
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
+  Impl::cuda_stream_synchronize(
+      s,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          DeepCopyResourceSynchronization,
+      "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync");
 }
 
 }  // namespace Impl
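The six DeepCopy constructor definitions removed here collapse into the two free functions. A hedged sketch of how a specialization can forward to them (the function names come from the hunk; the exact header-side form is assumed):

    template <>
    struct DeepCopy<CudaSpace, HostSpace, Cuda> {
      DeepCopy(void* dst, const void* src, size_t n) {
        DeepCopyCuda(dst, src, n);  // blocking cudaMemcpy path
      }
      DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
        DeepCopyAsyncCuda(instance, dst, src, n);  // stream-ordered path
      }
    };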
@@ -137,6 +119,7 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
 
 namespace Kokkos {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 KOKKOS_DEPRECATED void CudaSpace::access_error() {
   const std::string msg(
       "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
@@ -150,6 +133,7 @@ KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) {
       "non-Cuda space");
   Kokkos::Impl::throw_runtime_exception(msg);
 }
+#endif
 
 /*--------------------------------------------------------------------------*/
 
@@ -164,9 +148,11 @@ bool CudaUVMSpace::available() {
 
 /*--------------------------------------------------------------------------*/
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 int CudaUVMSpace::number_of_allocations() {
   return Kokkos::Impl::num_uvm_allocations.load();
 }
+#endif
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
 // The purpose of the following variable is to allow a state-based choice
 // for pinning UVM allocations to the CPU. For now this is considered
@@ -204,6 +190,8 @@ CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}
 
 CudaHostPinnedSpace::CudaHostPinnedSpace() {}
 
+int memory_threshold_g = 40000;  // 40 kB
+
 //==============================================================================
 // <editor-fold desc="allocate()"> {{{1
 
@@ -221,7 +209,19 @@ void *CudaSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void *ptr = nullptr;
 
+#ifndef CUDART_VERSION
+#error CUDART_VERSION undefined!
+#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
+  cudaError_t error_code;
+  if (arg_alloc_size >= memory_threshold_g) {
+    error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  } else {
+    error_code = cudaMalloc(&ptr, arg_alloc_size);
+  }
+#else
   auto error_code = cudaMalloc(&ptr, arg_alloc_size);
+#endif
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
     cudaGetLastError();  // This is the only way to clear the last error, which
                          // we should do here since we're turning it into an
@@ -253,7 +253,8 @@ void *CudaUVMSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void *ptr = nullptr;
 
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation");
   if (arg_alloc_size > 0) {
     Kokkos::Impl::num_uvm_allocations++;
 
@@ -276,7 +277,8 @@ void *CudaUVMSpace::impl_allocate(
               CudaMallocManaged);
     }
   }
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation");
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     const size_t reported_size =
         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@@ -337,9 +339,20 @@ void CudaSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-
   try {
-    CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#ifndef CUDART_VERSION
+#error CUDART_VERSION undefined!
+#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
+    if (arg_alloc_size >= memory_threshold_g) {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    } else {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+    }
+#else
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#endif
   } catch (...) {
   }
 }
@@ -362,7 +375,8 @@ void CudaUVMSpace::impl_deallocate(
     ,
     const size_t arg_logical_size,
     const Kokkos::Tools::SpaceHandle arg_handle) const {
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation");
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     const size_t reported_size =
         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@@ -372,11 +386,12 @@ void CudaUVMSpace::impl_deallocate(
   try {
     if (arg_alloc_ptr != nullptr) {
       Kokkos::Impl::num_uvm_allocations--;
-      CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
     }
   } catch (...) {
   }
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation");
 }
 
 void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
@@ -401,7 +416,7 @@ void CudaHostPinnedSpace::impl_deallocate(
                                       reported_size);
   }
   try {
-    CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
   } catch (...) {
   }
 }
@@ -462,7 +477,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
   resDesc.res.linear.sizeInBytes = alloc_size;
   resDesc.res.linear.devPtr      = alloc_ptr;
 
-  CUDA_SAFE_CALL(
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
       cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr));
 
   return tex_obj;
@@ -581,7 +596,7 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
                            bool to_device) {
   if ((ptr == nullptr) || (bytes == 0)) return;
   cudaPointerAttributes attr;
-  CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
   // I measured this and it turns out prefetching towards the host slows
   // DualView syncs down. Probably because the latency is not too bad in the
   // first place for the pull down. If we want to change that provide
@@ -593,8 +608,8 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
 #endif
   if (to_device && is_managed &&
       space.cuda_device_prop().concurrentManagedAccess) {
-    CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(),
-                                        space.cuda_stream()));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync(
+        ptr, bytes, space.cuda_device(), space.cuda_stream()));
   }
 }
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 0f4259072d97f26c0032e674bdf60b9031fcee11..993c8d1bbadc4ebff2fcc9bdc905fca6bb37a9cf 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -134,7 +134,12 @@ inline int cuda_deduce_block_size(bool early_termination,
     }
 
     if (blocks_per_sm >= min_blocks_per_sm) {
-      if (threads_per_sm >= opt_threads_per_sm) {
+      // The logic prefers smaller block sizes over larger ones to
+      // give more flexibility to the scheduler.
+      // But don't go below 128 where performance suffers significantly
+      // for simple copy/set kernels.
+      if ((threads_per_sm > opt_threads_per_sm) ||
+          ((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) {
         opt_block_size     = block_size;
         opt_threads_per_sm = threads_per_sm;
       }
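To make the new tie-breaking rule concrete, here it is restated as a standalone predicate (assuming the surrounding loop visits block sizes from large to small):

    bool take_candidate(int block_size, int threads_per_sm,
                        int opt_threads_per_sm) {
      // Strictly better occupancy always wins; on a tie, prefer the smaller
      // block size, but never drop below the 128-thread floor.
      return (threads_per_sm > opt_threads_per_sm) ||
             (block_size >= 128 && threads_per_sm == opt_threads_per_sm);
    }

If 512, 256, and 128 all reach the same threads_per_sm, each tie replaces the previous optimum and 128 is deduced; a 64-thread tie is rejected.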
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
index 4759001d81f99afc0a1e2aa6cf64462d9e7fcdc9..36df0d2564ae8ab86849837cf60cb6d93727aab2 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@@ -49,13 +49,19 @@
 #ifdef KOKKOS_ENABLE_CUDA
 
 #include <impl/Kokkos_Error.hpp>
-
+#include <impl/Kokkos_Profiling.hpp>
 #include <iosfwd>
 
 namespace Kokkos {
 namespace Impl {
 
-void cuda_device_synchronize();
+void cuda_stream_synchronize(
+    const cudaStream_t stream,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const std::string& name);
+void cuda_device_synchronize(const std::string& name);
+void cuda_stream_synchronize(const cudaStream_t stream,
+                             const std::string& name);
 
 void cuda_internal_error_throw(cudaError e, const char* name,
                                const char* file = nullptr, const int line = 0);
@@ -68,9 +74,24 @@ inline void cuda_internal_safe_call(cudaError e, const char* name,
   }
 }
 
-#define CUDA_SAFE_CALL(call) \
+#define KOKKOS_IMPL_CUDA_SAFE_CALL(call) \
   Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__)
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
+KOKKOS_DEPRECATED
+inline void cuda_internal_safe_call_deprecated(cudaError e, const char* name,
+                                               const char* file = nullptr,
+                                               const int line   = 0) {
+  cuda_internal_safe_call(e, name, file, line);
+}
+
+#define CUDA_SAFE_CALL(call)                                              \
+  Kokkos::Impl::cuda_internal_safe_call_deprecated(call, #call, __FILE__, \
+                                                   __LINE__)
+
+#endif
+
 }  // namespace Impl
 
 namespace Experimental {
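Usage of the renamed macro is unchanged; a short sketch (ptr and nbytes are placeholders):

    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemset(ptr, 0, nbytes));  // preferred

    // Still compiles under KOKKOS_ENABLE_DEPRECATED_CODE_3, but routes
    // through the KOKKOS_DEPRECATED helper so callers get a warning:
    CUDA_SAFE_CALL(cudaMemset(ptr, 0, nbytes));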
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
index 3de7a69916130de41077bae684df0cbc87daea4b..bd514f5e88d915b46eccc6ccd5da7baa311088e3 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
@@ -60,6 +60,7 @@
 
 #include <Kokkos_Cuda.hpp>
 #include <cuda_runtime_api.h>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -82,8 +83,8 @@ struct GraphImpl<Kokkos::Cuda> {
     constexpr size_t error_log_size = 256;
     cudaGraphNode_t error_node      = nullptr;
     char error_log[error_log_size];
-    CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node,
-                                        error_log, error_log_size));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate(
+        &m_graph_exec, m_graph, &error_node, error_log, error_log_size));
     // TODO @graphs print out errors
   }
 
@@ -107,26 +108,27 @@ struct GraphImpl<Kokkos::Cuda> {
     // TODO @graphs we need to somehow indicate the need for a fence in the
     //              destructor of the GraphImpl object (so that we don't have to
     //              just always do it)
-    m_execution_space.fence();
+    m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
     KOKKOS_EXPECTS(bool(m_graph))
     if (bool(m_graph_exec)) {
-      CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
     }
-    CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
   };
 
   explicit GraphImpl(Kokkos::Cuda arg_instance)
       : m_execution_space(std::move(arg_instance)) {
-    CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
   }
 
   void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
     // All of the predecessors are just added as normal, so all we need to
     // do here is add an empty node
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node),
-                                         m_graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
   }
 
   template <class NodeImpl>
@@ -171,7 +173,7 @@ struct GraphImpl<Kokkos::Cuda> {
     auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node;
     KOKKOS_EXPECTS(bool(cuda_node))
 
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
   }
 
@@ -179,7 +181,7 @@ struct GraphImpl<Kokkos::Cuda> {
     if (!bool(m_graph_exec)) {
       _instantiate_graph();
     }
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
   }
 
@@ -192,9 +194,10 @@ struct GraphImpl<Kokkos::Cuda> {
     KOKKOS_EXPECTS(!bool(m_graph_exec))
     auto rv = std::make_shared<root_node_impl_t>(
         get_execution_space(), _graph_node_is_root_ctor_tag{});
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
     KOKKOS_ENSURES(bool(rv->node_details_t::node))
     return rv;
   }
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
index ec9c434fe663900a5d5029896a5c98ce13266605..c81286eb1004b10219b64f38563bc3e8af257ae9 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
@@ -51,6 +51,9 @@
     !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) ||  \
       defined(KOKKOS_ARCH_MAXWELL52))
 #include <cuda_fp16.h>
+#include <iosfwd>  // istream & ostream for extraction and insertion ops
+#include <string>
+#include <Kokkos_NumericTraits.hpp>  // reduction_identity
 
 #ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
 // Make sure no one else tries to define half_t
@@ -127,7 +130,7 @@ KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
         cast_from_half(half_t);
 
-class half_t {
+class alignas(2) half_t {
  public:
   using impl_type = Kokkos::Impl::half_impl_t::type;
 
@@ -138,6 +141,22 @@ class half_t {
   KOKKOS_FUNCTION
   half_t() : val(0.0F) {}
 
+  // Copy constructors
+  KOKKOS_DEFAULTED_FUNCTION
+  half_t(const half_t&) noexcept = default;
+
+  KOKKOS_INLINE_FUNCTION
+  half_t(const volatile half_t& rhs) {
+#ifdef __CUDA_ARCH__
+    val = rhs.val;
+#else
+    const volatile uint16_t* rv_ptr =
+        reinterpret_cast<const volatile uint16_t*>(&rhs.val);
+    const uint16_t rv_val = *rv_ptr;
+    val                   = reinterpret_cast<const impl_type&>(rv_val);
+#endif  // __CUDA_ARCH__
+  }
+
   // Don't support implicit conversion back to impl_type.
   // impl_type is a storage only type on host.
   KOKKOS_FUNCTION
@@ -219,7 +238,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     tmp.val = +tmp.val;
 #else
-    tmp.val   = __float2half(+__half2float(tmp.val));
+    tmp.val               = __float2half(+__half2float(tmp.val));
 #endif
     return tmp;
   }
@@ -230,7 +249,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     tmp.val = -tmp.val;
 #else
-    tmp.val   = __float2half(-__half2float(tmp.val));
+    tmp.val               = __float2half(-__half2float(tmp.val));
 #endif
     return tmp;
   }
@@ -241,7 +260,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     ++val;
 #else
-    float tmp = __half2float(val);
+    float tmp             = __half2float(val);
     ++tmp;
     val       = __float2half(tmp);
 #endif
@@ -255,7 +274,7 @@ class half_t {
 #else
     float tmp = __half2float(val);
     --tmp;
-    val = __float2half(tmp);
+    val     = __float2half(tmp);
 #endif
     return *this;
   }
@@ -290,7 +309,10 @@ class half_t {
 
   template <class T>
   KOKKOS_FUNCTION void operator=(T rhs) volatile {
-    val = cast_to_half(rhs).val;
+    impl_type new_val = cast_to_half(rhs).val;
+    volatile uint16_t* val_ptr =
+        reinterpret_cast<volatile uint16_t*>(const_cast<impl_type*>(&val));
+    *val_ptr = reinterpret_cast<uint16_t&>(new_val);
   }
 
   // Compound operators
@@ -299,30 +321,21 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val += rhs.val;
 #else
-    val = __float2half(__half2float(val) + __half2float(rhs.val));
+    val     = __float2half(__half2float(val) + __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator+=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) + rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) +
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator+=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs += tmp_rhs;
+    *this = tmp_lhs;
   }
 
-  // Compund operators: upcast overloads for +=
+  // Compound operators: upcast overloads for +=
   template <class T>
   KOKKOS_FUNCTION std::enable_if_t<
       std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
@@ -350,27 +363,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val -= rhs.val;
 #else
-    val          = __float2half(__half2float(val) - __half2float(rhs.val));
+    val     = __float2half(__half2float(val) - __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator-=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) - rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) -
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator-=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs -= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compound operators: upcast overloads for -=
@@ -401,27 +405,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val *= rhs.val;
 #else
-    val          = __float2half(__half2float(val) * __half2float(rhs.val));
+    val     = __float2half(__half2float(val) * __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator*=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) * rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) *
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator*=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs *= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compound operators: upcast overloads for *=
@@ -452,27 +447,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val /= rhs.val;
 #else
-    val          = __float2half(__half2float(val) / __half2float(rhs.val));
+    val     = __float2half(__half2float(val) / __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator/=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) / rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) /
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator/=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs /= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compound operators: upcast overloads for /=
@@ -504,7 +490,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val += rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -529,7 +515,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val -= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -554,7 +540,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val *= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -579,7 +565,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val /= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -683,6 +669,62 @@ class half_t {
     return __half2float(val) >= __half2float(rhs.val);
 #endif
   }
+
+  KOKKOS_FUNCTION
+  friend bool operator==(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs == tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator!=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs != tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<(const volatile half_t& lhs,
+                        const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs < tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>(const volatile half_t& lhs,
+                        const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs > tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs <= tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs >= tmp_rhs;
+  }
+
+  // Insertion and extraction operators
+  friend std::ostream& operator<<(std::ostream& os, const half_t& x) {
+    const std::string out = std::to_string(static_cast<double>(x));
+    os << out;
+    return os;
+  }
+
+  friend std::istream& operator>>(std::istream& is, half_t& x) {
+    std::string in;
+    is >> in;
+    x = std::stod(in);
+    return is;
+  }
 };
 
 // CUDA before 11.1 only has the half <-> float conversions marked host device
@@ -943,6 +985,25 @@ KOKKOS_INLINE_FUNCTION
 }
 #endif
 }  // namespace Experimental
+
+// use float as the return type for sum and prod since cuda_fp16.h
+// has no constexpr functions for casting to __half
+template <>
+struct reduction_identity<Kokkos::Experimental::half_t> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() noexcept {
+    return 0.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() noexcept {
+    return 1.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() noexcept {
+    return -65504.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() noexcept {
+    return 65504.0F;
+  }
+};
+
 }  // namespace Kokkos
 #endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
 #endif  // KOKKOS_ENABLE_CUDA
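A usage sketch for the reduction_identity specialization; 65504 is the largest finite binary16 value, hence the min()/max() bounds. Here values is an assumed View of half_t and n its extent:

    Kokkos::Experimental::half_t total;
    Kokkos::parallel_reduce(
        "half_sum", n,
        KOKKOS_LAMBDA(const int i, Kokkos::Experimental::half_t& lsum) {
          lsum += values(i);  // uses the compound operators defined above
        },
        total);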
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 016cb6cdcbdd37740613724bb99efb9b4c32d7d4..6964d5b41b72368e9b4305d37e156e08321f7814 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -119,7 +119,7 @@ int cuda_kernel_arch() {
   int arch    = 0;
   int *d_arch = nullptr;
 
-  cudaMalloc((void **)&d_arch, sizeof(int));
+  cudaMalloc(reinterpret_cast<void **>(&d_arch), sizeof(int));
   cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault);
 
   query_cuda_kernel_arch<<<1, 1>>>(d_arch);
@@ -141,7 +141,36 @@ bool cuda_launch_blocking() {
 
 }  // namespace
 
-void cuda_device_synchronize() { CUDA_SAFE_CALL(cudaDeviceSynchronize()); }
+void cuda_device_synchronize(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      });
+}
+
+void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
+                             const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          ptr->impl_get_instance_id()},
+      [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
+
+void cuda_stream_synchronize(
+    const cudaStream_t stream,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name, reason, [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
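With these helpers every backend fence funnels through profile_fence_event, so profiling tools can attribute it; callers pass a human-readable name, as the initialization path further down does:

    Kokkos::Impl::cuda_device_synchronize(
        "Kokkos::CudaInternal::initialize: Fence on space initialization");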
 
 void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
                                const int line) {
@@ -221,7 +250,7 @@ CudaInternalDevices::CudaInternalDevices() {
   // See 'cudaSetDeviceFlags' for host-device thread interaction
   // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
 
-  CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
 
   if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
@@ -229,7 +258,7 @@ CudaInternalDevices::CudaInternalDevices() {
         "have. Please report this to github.com/kokkos/kokkos.");
   }
   for (int i = 0; i < m_cudaDevCount; ++i) {
-    CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
   }
 }
 
@@ -277,25 +306,27 @@ CudaInternal::~CudaInternal() {
               << std::endl;
   }
 
-  m_cudaDev                   = -1;
-  m_cudaArch                  = -1;
-  m_multiProcCount            = 0;
-  m_maxWarpCount              = 0;
-  m_maxBlock                  = 0;
-  m_maxSharedWords            = 0;
-  m_maxConcurrency            = 0;
-  m_scratchSpaceCount         = 0;
-  m_scratchFlagsCount         = 0;
-  m_scratchUnifiedCount       = 0;
-  m_scratchUnifiedSupported   = 0;
-  m_streamCount               = 0;
-  m_scratchSpace              = nullptr;
-  m_scratchFlags              = nullptr;
-  m_scratchUnified            = nullptr;
-  m_scratchConcurrentBitset   = nullptr;
-  m_stream                    = nullptr;
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  m_cudaDev                 = -1;
+  m_cudaArch                = -1;
+  m_multiProcCount          = 0;
+  m_maxWarpCount            = 0;
+  m_maxBlock                = 0;
+  m_maxSharedWords          = 0;
+  m_maxConcurrency          = 0;
+  m_scratchSpaceCount       = 0;
+  m_scratchFlagsCount       = 0;
+  m_scratchUnifiedCount     = 0;
+  m_scratchUnifiedSupported = 0;
+  m_streamCount             = 0;
+  m_scratchSpace            = nullptr;
+  m_scratchFlags            = nullptr;
+  m_scratchUnified          = nullptr;
+  m_scratchConcurrentBitset = nullptr;
+  m_stream                  = nullptr;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
+  }
 }
 
 int CudaInternal::verify_is_initialized(const char *const label) const {
@@ -305,16 +336,20 @@ int CudaInternal::verify_is_initialized(const char *const label) const {
   }
   return 0 <= m_cudaDev;
 }
-
+uint32_t CudaInternal::impl_get_instance_id() const { return m_instance_id; }
 CudaInternal &CudaInternal::singleton() {
   static CudaInternal self;
   return self;
 }
+void CudaInternal::fence(const std::string &name) const {
+  Impl::cuda_stream_synchronize(m_stream, this, name);
+}
 void CudaInternal::fence() const {
-  CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream));
+  fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence");
 }
 
-void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
+void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream,
+                              bool manage_stream) {
   if (was_finalized)
     Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
   was_initialized = true;
@@ -350,8 +385,9 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
     m_cudaDev    = cuda_device_id;
     m_deviceProp = cudaProp;
 
-    CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
-    Kokkos::Impl::cuda_device_synchronize();
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
+    Kokkos::Impl::cuda_device_synchronize(
+        "Kokkos::CudaInternal::initialize: Fence on space initialization");
 
     // Query what compute capability architecture a kernel executes:
     m_cudaArch = cuda_kernel_arch();
@@ -464,8 +500,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
 
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
 
-      CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
-                                sizeof(uint32_t) * buffer_bound));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
+                                            sizeof(uint32_t) * buffer_bound));
     }
     //----------------------------------
 
@@ -535,15 +571,19 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   // Allocate a staging buffer for constant mem in pinned host memory
   // and an event to avoid overwriting driver for previous kernel launches
   if (stream == nullptr) {
-    CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging,
-                                  CudaTraits::ConstantMemoryUsage));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMallocHost(reinterpret_cast<void **>(&constantMemHostStaging),
+                       CudaTraits::ConstantMemoryUsage));
 
-    CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
   }
 
-  m_stream                    = stream;
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  m_stream        = stream;
+  m_manage_stream = manage_stream;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
+  }
 }
 
 //----------------------------------------------------------------------------
@@ -569,7 +609,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
   }
 
@@ -645,20 +685,37 @@ Cuda::size_type *CudaInternal::scratch_functor(
   return m_scratchFunctor;
 }
 
-void *CudaInternal::resize_team_scratch_space(std::int64_t bytes,
-                                              bool force_shrink) {
-  if (m_team_scratch_current_size == 0) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
-        "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size);
+std::pair<void *, int> CudaInternal::resize_team_scratch_space(
+    std::int64_t bytes, bool force_shrink) {
+  // Multiple ParallelFor/Reduce Teams can call this function at the same time
+  // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race
+  // condition.
+
+  int current_team_scratch = 0;
+  int zero                 = 0;
+  int one                  = 1;
+  // Claim the first free slot: the CAS succeeds when a slot is 0 (free) and
+  // atomically marks it 1 (busy); keep probing the pool while it fails.
+  while (!m_team_scratch_pool[current_team_scratch].compare_exchange_weak(
+      zero, one, std::memory_order_release, std::memory_order_relaxed)) {
+    // compare_exchange_weak overwrites the expected value on failure.
+    zero                 = 0;
+    current_team_scratch = (current_team_scratch + 1) % m_n_team_scratch;
+  }
-  if ((bytes > m_team_scratch_current_size) ||
-      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
-        m_team_scratch_ptr, m_team_scratch_current_size);
+  if (m_team_scratch_current_size[current_team_scratch] == 0) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
+            "Kokkos::CudaSpace::TeamScratchMemory",
+            m_team_scratch_current_size[current_team_scratch]);
   }
-  return m_team_scratch_ptr;
+  if ((bytes > m_team_scratch_current_size[current_team_scratch]) ||
+      ((bytes < m_team_scratch_current_size[current_team_scratch]) &&
+       (force_shrink))) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
+            m_team_scratch_ptr[current_team_scratch],
+            m_team_scratch_current_size[current_team_scratch]);
+  }
+  return std::make_pair(m_team_scratch_ptr[current_team_scratch],
+                        current_team_scratch);
 }
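The returned pair carries the pool index so the dispatch that obtained the slot can hand it back once its kernel completes. A hedged sketch of the release side, which is not part of this hunk:

    // After the team kernel using this scratch slot has finished:
    m_team_scratch_pool[scratch_pool_id] = 0;  // mark the slot free again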
 
 //----------------------------------------------------------------------------
@@ -685,36 +742,43 @@ void CudaInternal::finalize() {
     if (m_scratchFunctorSize > 0)
       RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));
 
-    if (m_team_scratch_current_size > 0)
-      Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr);
-
-    m_cudaDev                   = -1;
-    m_multiProcCount            = 0;
-    m_maxWarpCount              = 0;
-    m_maxBlock                  = 0;
-    m_maxSharedWords            = 0;
-    m_scratchSpaceCount         = 0;
-    m_scratchFlagsCount         = 0;
-    m_scratchUnifiedCount       = 0;
-    m_streamCount               = 0;
-    m_scratchSpace              = nullptr;
-    m_scratchFlags              = nullptr;
-    m_scratchUnified            = nullptr;
-    m_scratchConcurrentBitset   = nullptr;
-    m_stream                    = nullptr;
-    m_team_scratch_current_size = 0;
-    m_team_scratch_ptr          = nullptr;
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      if (m_team_scratch_current_size[i] > 0)
+        Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]);
+    }
+
+    if (m_manage_stream && m_stream != nullptr)
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream));
+
+    m_cudaDev                 = -1;
+    m_multiProcCount          = 0;
+    m_maxWarpCount            = 0;
+    m_maxBlock                = 0;
+    m_maxSharedWords          = 0;
+    m_scratchSpaceCount       = 0;
+    m_scratchFlagsCount       = 0;
+    m_scratchUnifiedCount     = 0;
+    m_streamCount             = 0;
+    m_scratchSpace            = nullptr;
+    m_scratchFlags            = nullptr;
+    m_scratchUnified          = nullptr;
+    m_scratchConcurrentBitset = nullptr;
+    m_stream                  = nullptr;
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      m_team_scratch_current_size[i] = 0;
+      m_team_scratch_ptr[i]          = nullptr;
+    }
   }
 
   // only destroy these if we're finalizing the singleton
   if (this == &singleton()) {
-    cudaFreeHost(constantMemHostStaging);
-    cudaEventDestroy(constantMemReusable);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable));
     auto &deep_copy_space =
         Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
     if (deep_copy_space)
       deep_copy_space->impl_internal_space_instance()->finalize();
-    cudaStreamDestroy(cuda_get_deep_copy_stream());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream()));
   }
 }
 
@@ -823,7 +887,7 @@ Cuda::Cuda()
       "Cuda instance constructor");
 }
 
-Cuda::Cuda(cudaStream_t stream)
+Cuda::Cuda(cudaStream_t stream, bool manage_stream)
     : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) {
         ptr->finalize();
         delete ptr;
@@ -831,18 +895,31 @@ Cuda::Cuda(cudaStream_t stream)
   Impl::CudaInternal::singleton().verify_is_initialized(
       "Cuda instance constructor");
   m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,
-                               stream);
+                               stream, manage_stream);
 }
 
 void Cuda::print_configuration(std::ostream &s, const bool) {
   Impl::CudaInternal::singleton().print_configuration(s);
 }
 
-void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); }
+void Cuda::impl_static_fence(const std::string &name) {
+  Kokkos::Impl::cuda_device_synchronize(name);
+}
+void Cuda::impl_static_fence() {
+  impl_static_fence("Kokkos::Cuda::impl_static_fence(): Unnamed Static Fence");
+}
 
-void Cuda::fence() const { m_space_instance->fence(); }
+void Cuda::fence() const {
+  fence("Kokkos::Cuda::fence(): Unnamed Instance Fence");
+}
+void Cuda::fence(const std::string &name) const {
+  m_space_instance->fence(name);
+}
 
 const char *Cuda::name() { return "Cuda"; }
+uint32_t Cuda::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
 
 cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; }
 int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; }
@@ -877,7 +954,15 @@ void CudaSpaceInitializer::finalize(bool all_spaces) {
   }
 }
 
-void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); }
+void CudaSpaceInitializer::fence() {
+  Kokkos::Cuda::impl_static_fence(
+      "Kokkos::CudaSpaceInitializer::fence: Initializer Fence");
+}
+void CudaSpaceInitializer::fence(const std::string &name) {
+  // TODO: decide whether to prefix the name instead, i.e.
+  // Kokkos::Cuda::impl_static_fence("Kokkos::CudaSpaceInitializer::fence: " + name);
+  Kokkos::Cuda::impl_static_fence(name);
+}
 
 void CudaSpaceInitializer::print_configuration(std::ostream &msg,
                                                const bool detail) {
@@ -916,12 +1001,6 @@ void CudaSpaceInitializer::print_configuration(std::ostream &msg,
   msg << "yes\n";
 #else
   msg << "no\n";
-#endif
-  msg << "  KOKKOS_ENABLE_CUSPARSE: ";
-#ifdef KOKKOS_ENABLE_CUSPARSE
-  msg << "yes\n";
-#else
-  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index aaec2c29260a5ad2b82e2daa653a58372253cd4d..7eb169838c05dc144e9789d4466f83d3febfe926 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -3,6 +3,9 @@
 
 #include <vector>
 #include <impl/Kokkos_Tools.hpp>
+#include <atomic>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // These functions fulfill the purpose of allowing to work around
@@ -114,10 +117,14 @@ class CudaInternal {
   mutable size_type* m_scratchFunctor;
   uint32_t* m_scratchConcurrentBitset;
   cudaStream_t m_stream;
+  uint32_t m_instance_id;
+  bool m_manage_stream;
 
   // Team Scratch Level 1 Space
-  mutable int64_t m_team_scratch_current_size;
-  mutable void* m_team_scratch_ptr;
+  int m_n_team_scratch = 10;
+  mutable int64_t m_team_scratch_current_size[10];
+  mutable void* m_team_scratch_ptr[10];
+  mutable std::atomic_int m_team_scratch_pool[10];
 
   bool was_initialized = false;
   bool was_finalized   = false;
@@ -135,7 +142,8 @@ class CudaInternal {
     return nullptr != m_scratchSpace && nullptr != m_scratchFlags;
   }
 
-  void initialize(int cuda_device_id, cudaStream_t stream = nullptr);
+  void initialize(int cuda_device_id, cudaStream_t stream = nullptr,
+                  bool manage_stream = false);
   void finalize();
 
   void print_configuration(std::ostream&) const;
@@ -145,6 +153,7 @@ class CudaInternal {
   static void cuda_set_serial_execution(bool);
 #endif
 
+  void fence(const std::string&) const;
   void fence() const;
 
   ~CudaInternal();
@@ -175,20 +184,68 @@ class CudaInternal {
         m_scratchFunctor(nullptr),
         m_scratchConcurrentBitset(nullptr),
         m_stream(nullptr),
-        m_team_scratch_current_size(0),
-        m_team_scratch_ptr(nullptr) {}
+        m_instance_id(
+            Kokkos::Tools::Experimental::Impl::idForInstance<Kokkos::Cuda>(
+                reinterpret_cast<uintptr_t>(this))) {
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      m_team_scratch_current_size[i] = 0;
+      m_team_scratch_ptr[i]          = nullptr;
+      m_team_scratch_pool[i]         = 0;
+    }
+  }
 
   // Resizing of reduction related scratch spaces
   size_type* scratch_space(const size_type size) const;
   size_type* scratch_flags(const size_type size) const;
   size_type* scratch_unified(const size_type size) const;
   size_type* scratch_functor(const size_type size) const;
-
+  uint32_t impl_get_instance_id() const;
   // Resizing of team level 1 scratch
-  void* resize_team_scratch_space(std::int64_t bytes,
-                                  bool force_shrink = false);
+  std::pair<void*, int> resize_team_scratch_space(std::int64_t bytes,
+                                                  bool force_shrink = false);
 };
 
 }  // Namespace Impl
+
+namespace Experimental {
+// Partitioning an Execution Space: expects a space and integer arguments
+// giving the relative weight of each partition.
+//   Customization point for backends
+//   Default behavior is to return the passed-in instance
+
+namespace Impl {
+inline void create_Cuda_instances(std::vector<Cuda>& instances) {
+  for (int s = 0; s < int(instances.size()); s++) {
+    cudaStream_t stream;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
+    instances[s] = Cuda(stream, true);
+  }
+}
+}  // namespace Impl
+
+template <class... Args>
+std::vector<Cuda> partition_space(const Cuda&, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+  std::vector<Cuda> instances(sizeof...(Args));
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
+
+template <class T>
+std::vector<Cuda> partition_space(const Cuda&, std::vector<T>& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<Cuda> instances(weights.size());
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
+}  // namespace Experimental
+
 }  // Namespace Kokkos
 #endif
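For context, here is a sketch of how the new `partition_space` entry point might be used to obtain independent `Cuda` instances, each backed by its own stream (per `create_Cuda_instances` above). This is a usage sketch, not code from the patch; it assumes a build with CUDA extended lambdas enabled, and omits error handling.

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Two sub-instances with equal relative weight; with the CUDA backend
    // each one wraps a freshly created stream.
    auto instances =
        Kokkos::Experimental::partition_space(Kokkos::Cuda(), 1, 1);

    // Work submitted to different instances may overlap on the device.
    Kokkos::parallel_for(
        Kokkos::RangePolicy<Kokkos::Cuda>(instances[0], 0, 1000),
        KOKKOS_LAMBDA(int i) { (void)i; });
    Kokkos::parallel_for(
        Kokkos::RangePolicy<Kokkos::Cuda>(instances[1], 0, 1000),
        KOKKOS_LAMBDA(int i) { (void)i; });

    instances[0].fence("example: wait on first partition");
    instances[1].fence("example: wait on second partition");
  }
  Kokkos::finalize();
}
```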
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index d892a893b330772ec5e4306ed20a44f8aa2369f1..4b01798f5e2cad495c897b8110d96eec87fe429f 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -167,7 +167,7 @@ inline void configure_shmem_preference(KernelFuncPtr const& func,
 #ifndef KOKKOS_ARCH_KEPLER
   // On Kepler the L1 has no benefit since it doesn't cache reads
   auto set_cache_config = [&] {
-    CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
         func,
         (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
     return prefer_shmem;
@@ -372,14 +372,15 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
           &graph_node, graph, /* dependencies = */ nullptr,
           /* numDependencies = */ 0, &params));
     } else {
       // We still need an empty node for the dependency structure
-      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                           /* dependencies = */ nullptr,
-                                           /* numDependencies = */ 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaGraphAddEmptyNode(&graph_node, graph,
+                                /* dependencies = */ nullptr,
+                                /* numDependencies = */ 0));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -475,14 +476,15 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
           &graph_node, graph, /* dependencies = */ nullptr,
           /* numDependencies = */ 0, &params));
     } else {
       // We still need an empty node for the dependency structure
-      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                           /* dependencies = */ nullptr,
-                                           /* numDependencies = */ 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaGraphAddEmptyNode(&graph_node, graph,
+                                /* dependencies = */ nullptr,
+                                /* numDependencies = */ 0));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -538,7 +540,8 @@ struct CudaParallelLaunchKernelInvoker<
                             dim3 const& block, int shmem,
                             CudaInternal const* cuda_instance) {
     // Wait until the previous kernel that uses the constant buffer is done
-    CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventSynchronize(cuda_instance->constantMemReusable));
 
     // Copy functor (synchronously) to staging buffer in pinned host memory
     unsigned long* staging = cuda_instance->constantMemHostStaging;
@@ -554,8 +557,9 @@ struct CudaParallelLaunchKernelInvoker<
          get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();
 
     // Record an event that says when the constant buffer can be reused
-    CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
-                                   cudaStream_t(cuda_instance->m_stream)));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventRecord(cuda_instance->constantMemReusable,
+                        cudaStream_t(cuda_instance->m_stream)));
   }
 
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
@@ -637,8 +641,9 @@ struct CudaParallelLaunchImpl<
       base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      CUDA_SAFE_CALL(cudaGetLastError());
-      cuda_instance->fence();
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+      cuda_instance->fence(
+          "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error");
 #endif
     }
   }
@@ -650,7 +655,7 @@ struct CudaParallelLaunchImpl<
     // the code and the result is visible.
     auto wrap_get_attributes = []() -> cudaFuncAttributes {
       cudaFuncAttributes attr_tmp;
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
       return attr_tmp;
     };
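The mechanical rename from `CUDA_SAFE_CALL` to `KOKKOS_IMPL_CUDA_SAFE_CALL` throughout this file keeps the usual check-and-abort idiom for CUDA runtime calls. For readers unfamiliar with it, a self-contained sketch of what such a macro typically expands to follows; the macro name and message format here are illustrative, not the exact Kokkos implementation.

```cuda
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Wrap every runtime API call so the result code is checked at the call
// site, with file/line information preserved for diagnostics.
#define MY_CUDA_SAFE_CALL(call)                                  \
  do {                                                           \
    cudaError_t err = (call);                                    \
    if (err != cudaSuccess) {                                    \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",           \
                   cudaGetErrorString(err), __FILE__, __LINE__); \
      std::abort();                                              \
    }                                                            \
  } while (0)

int main() {
  void* p = nullptr;
  MY_CUDA_SAFE_CALL(cudaMalloc(&p, 1024));
  MY_CUDA_SAFE_CALL(cudaFree(p));
  MY_CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
```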
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
index ff31649544033b773519152ca25a22494fdd2f5f..1f3024f3186a14d847a6999b995832e7782b62e9 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -81,22 +81,34 @@ namespace Impl {
 CudaLockArrays g_host_cuda_lock_arrays = {nullptr, nullptr, 0};
 
 void initialize_host_cuda_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::init_lock_arrays();
+
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+#endif
   if (g_host_cuda_lock_arrays.atomic != nullptr) return;
-  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
-                            sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
-  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
-                            sizeof(int) * (Cuda::concurrency())));
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      cudaMalloc(&g_host_cuda_lock_arrays.atomic,
+                 sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
+                                        sizeof(int) * (Cuda::concurrency())));
+  Impl::cuda_device_synchronize(
+      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
   g_host_cuda_lock_arrays.n = Cuda::concurrency();
   KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
   init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
                                   256>>>();
   init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency() + 255) / 256,
                                     256>>>(Kokkos::Cuda::concurrency());
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  Impl::cuda_device_synchronize(
+      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays");
 }
 
 void finalize_host_cuda_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::finalize_lock_arrays();
+#endif
+
   if (g_host_cuda_lock_arrays.atomic == nullptr) return;
   cudaFree(g_host_cuda_lock_arrays.atomic);
   g_host_cuda_lock_arrays.atomic = nullptr;
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
index 7640b8084d16a210408deb94a35f8962dfc92c99..04fb7cb345a27e9d9932d188216d5261d8606939 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -53,6 +53,10 @@
 
 #include <Cuda/Kokkos_Cuda_Error.hpp>
 
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics/Lock_Array_Cuda.hpp>
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -150,13 +154,14 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
 }  // namespace Impl
 }  // namespace Kokkos
+
 /* Dan Ibanez: it is critical that this code be a macro, so that it will
    capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
    putting this in an inline function will NOT do the right thing! */
 #define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                      \
   {                                                                   \
     if (::Kokkos::Impl::lock_array_copied == 0) {                     \
-      CUDA_SAFE_CALL(                                                 \
+      KOKKOS_IMPL_CUDA_SAFE_CALL(                                     \
           cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
                              &Kokkos::Impl::g_host_cuda_lock_arrays,  \
                              sizeof(Kokkos::Impl::CudaLockArrays)));  \
@@ -164,6 +169,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
     lock_array_copied = 1;                                            \
   }
 
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
 #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
@@ -171,6 +178,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
   KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
+#else
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+// KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE is still needed for team scratch etc.
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#endif
+
+#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
+
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 
 #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
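The "it is critical that this code be a macro" comment above reflects how `__device__` globals behave without relocatable device code: each translation unit gets its own instance of the symbol, so the `cudaMemcpyToSymbol` must be expanded, via macro, in every TU that launches kernels, targeting that TU's copy. A reduced illustration of the idiom, with hypothetical names and error checking omitted:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Per-translation-unit device-side copy of a small descriptor. Without
// relocatable device code (-rdc=true), every .cu file that includes this
// header owns a distinct instance of g_device_table.
struct Table { int* data; int n; };
__device__ Table g_device_table;

Table g_host_table = {nullptr, 0};

// Must stay a macro: expanding it in the calling TU makes
// cudaMemcpyToSymbol target *that* TU's g_device_table instance.
#define COPY_TABLE_TO_DEVICE() \
  cudaMemcpyToSymbol(g_device_table, &g_host_table, sizeof(Table))

__global__ void use_table() { printf("n = %d\n", g_device_table.n); }

int main() {
  g_host_table.n = 42;
  COPY_TABLE_TO_DEVICE();
  use_table<<<1, 1>>>();
  cudaDeviceSynchronize();
}
```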
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 2834e6f3de012b718ae06ebb6f87d7d24e3e5756..f83b43e608855492f9d4df725533a08184f5edaf 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -62,7 +62,6 @@
 #include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include <Cuda/Kokkos_Cuda_Team.hpp>
 #include <Kokkos_Vectorization.hpp>
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
 #include <typeinfo>
@@ -240,9 +239,11 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
 
   //----------------------------------------
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED inline int vector_length() const {
     return impl_vector_length();
   }
+#endif
   inline int impl_vector_length() const { return m_vector_length; }
   inline int team_size() const { return m_team_size; }
   inline int league_size() const { return m_league_size; }
@@ -687,6 +688,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_shmem_size;
   void* m_scratch_ptr[2];
   int m_scratch_size[2];
+  int m_scratch_pool_id = -1;
 
   template <class TagType>
   __device__ inline
@@ -797,15 +799,19 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
     m_scratch_ptr[0] = nullptr;
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
-                      static_cast<ptrdiff_t>(Cuda::concurrency() /
-                                             (m_team_size * m_vector_size)));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     const int shmem_size_total = m_shmem_begin + m_shmem_size;
     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
@@ -829,6 +835,14 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
           "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
     }
   }
+
+  ~ParallelFor() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
 };
 
 }  // namespace Impl
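The new `m_scratch_pool_id` plumbing implements a small fixed-size pool: `resize_team_scratch_space` now returns both a buffer pointer and the index of the pool slot it claimed, and the destructor above releases the slot by storing 0 into the corresponding `std::atomic_int`. A minimal standalone sketch of that claim/release scheme (names and the spin-wait policy are illustrative; the real pool also grows each slot's allocation on demand):

```cpp
#include <atomic>
#include <cassert>
#include <utility>

// A fixed pool of N scratch slots; each slot is claimed by atomically
// flipping its flag from 0 to 1 and released by storing 0 back.
template <int N>
class ScratchPool {
  std::atomic_int m_in_use[N] = {};
  void* m_ptr[N]              = {};

 public:
  // Returns the slot's pointer and its index; spins until a slot is free,
  // mirroring how concurrent team kernels share a bounded set of buffers.
  std::pair<void*, int> claim() {
    for (;;) {
      for (int i = 0; i < N; ++i) {
        int expected = 0;
        if (m_in_use[i].compare_exchange_strong(expected, 1))
          return {m_ptr[i], i};
      }
    }
  }

  void release(int id) {
    assert(id >= 0 && id < N);
    m_in_use[id] = 0;
  }
};

int main() {
  ScratchPool<10> pool;
  auto slot = pool.claim();   // analogous to resize_team_scratch_space
  pool.release(slot.second);  // analogous to the ~ParallelFor cleanup
}
```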
@@ -870,9 +884,24 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using value_type     = typename ValueTraits::value_type;
   using reference_type = typename ValueTraits::reference_type;
   using functor_type   = FunctorType;
-  using size_type      = Kokkos::Cuda::size_type;
-  using index_type     = typename Policy::index_type;
-  using reducer_type   = ReducerType;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type).
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on
+  // word_size_type, and word_size_type is used again when computing indices
+  // into the shared/global memory buffers.
+  // For scalars larger than 4 bytes, indexing into shared/global memory
+  // relies on the block and grid dimensions to ensure that we index at the
+  // correct offset rather than at every 4-byte word, so that the join sees
+  // exactly the data that was copied over in chunks of 4 bytes.
+  using word_size_type = typename std::conditional<
+      sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
+      typename std::conditional<sizeof(value_type) == 2, int16_t, int8_t>::type,
+      Kokkos::Cuda::size_type>::type;
+  using index_type   = typename Policy::index_type;
+  using reducer_type = ReducerType;
 
   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
@@ -883,9 +912,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_scratch_space;
+  // m_scratch_flags must be of type Cuda::size_type due to use of atomics
+  // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
+  Cuda::size_type* m_scratch_flags;
+  word_size_type* m_unified_space;
 
   // Shall we use the shfl based reduction or not (only use it for static sized
   // types of more than 128bit)
@@ -924,16 +955,16 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       __device__ inline
       void run(const DummySHMEMReductionType& ) const
       {*/
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
+    const integral_nonzero_constant<
+        word_size_type, ValueTraits::StaticValueSize / sizeof(word_size_type)>
         word_count(ValueTraits::value_size(
                        ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
+                   sizeof(word_size_type));
 
     {
       reference_type value =
           ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          kokkos_impl_cuda_shared_memory<size_type>() +
+                          kokkos_impl_cuda_shared_memory<word_size_type>() +
                               threadIdx.y * word_count.value);
 
       // Number of blocks is bounded so that the reduction can be limited to two
@@ -958,11 +989,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       // This is the final block with the final result at the final threads'
       // location
 
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
               : (m_unified_space ? m_unified_space : m_scratch_space);
 
       if (threadIdx.y == 0) {
@@ -985,17 +1017,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd,
                                                 WorkTagFwd>(
                 ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-                gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
+                gridDim.x, kokkos_impl_cuda_shared_memory<word_size_type>(),
                 m_scratch_space, m_scratch_flags)) {
           // This is the final block with the final result at the final threads'
           // location
 
-          size_type* const shared =
-              kokkos_impl_cuda_shared_memory<size_type>() +
+          word_size_type* const shared =
+              kokkos_impl_cuda_shared_memory<word_size_type>() +
               (blockDim.y - 1) * word_count.value;
-          size_type* const global =
+          word_size_type* const global =
               m_result_ptr_device_accessible
-                  ? reinterpret_cast<size_type*>(m_result_ptr)
+                  ? reinterpret_cast<word_size_type*>(m_result_ptr)
                   : (m_unified_space ? m_unified_space : m_scratch_space);
 
           if (threadIdx.y == 0) {
@@ -1100,15 +1132,21 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
       KOKKOS_ASSERT(block_size > 0);
 
-      m_scratch_space = cuda_internal_scratch_space(
+      // TODO: down-casting these here may allocate more space than required.
+      m_scratch_space = (word_size_type*)cuda_internal_scratch_space(
           m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
                                 m_functor, m_reducer)) *
                                 block_size /* block_size == max block_count */);
-      m_scratch_flags =
-          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-      m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)));
+
+      // Intentionally do not downcast to word_size_type since we use Cuda
+      // atomics in Kokkos_Cuda_ReduceScan.hpp
+      m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(),
+                                                    sizeof(Cuda::size_type));
+      m_unified_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+              m_policy.space(),
+              ValueTraits::value_size(
+                  ReducerConditional::select(m_functor, m_reducer))));
 
       // REQUIRED ( 1 , N , 1 )
       dim3 block(1, block_size, 1);
@@ -1139,7 +1177,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, RangePolicy>::execute: Result "
+            "Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1459,7 +1499,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, MDRangePolicy>::execute: "
+            "Result Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1580,6 +1622,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_type m_shmem_size;
   void* m_scratch_ptr[2];
   int m_scratch_size[2];
+  int m_scratch_pool_id = -1;
   const size_type m_league_size;
   int m_team_size;
   const size_type m_vector_size;
@@ -1821,7 +1864,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
           true);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, TeamPolicy>::execute: Result "
+            "Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1895,16 +1940,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<std::int64_t>(m_scratch_size[1]) *
-                      (static_cast<std::int64_t>(
-                          Cuda::concurrency() /
-                          (m_team_size * m_vector_size))));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     // The global parallel_reduce does not support vector_length other than 1 at
     // the moment
@@ -1973,6 +2021,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     cudaFuncAttributes attr =
         CudaParallelLaunch<ParallelReduce,
                            LaunchBounds>::get_cuda_func_attributes();
+
+    // If a valid team size was not provided, deduce one
     m_team_size =
         m_team_size >= 0
             ? m_team_size
@@ -1994,15 +2044,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
-                      static_cast<ptrdiff_t>(Cuda::concurrency() /
-                                             (m_team_size * m_vector_size)));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     // The global parallel_reduce does not support vector_length other than 1 at
     // the moment
@@ -2030,13 +2084,28 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
     }
-    if (int(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+
+    size_type team_size_max =
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+            m_policy.space().impl_internal_space_instance(), attr, m_functor,
+            m_vector_size, m_policy.team_scratch_size(0),
+            m_policy.thread_scratch_size(0)) /
+        m_vector_size;
+
+    if ((int)m_team_size > (int)team_size_max) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
                       "large team size."));
     }
   }
+
+  ~ParallelReduce() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
 };
 
 }  // namespace Impl
@@ -2167,9 +2236,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
     for (typename Policy::member_type iwork_base = range.begin();
          iwork_base < range.end(); iwork_base += blockDim.y) {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-#endif
+      unsigned MASK                            = __activemask();
       const typename Policy::member_type iwork = iwork_base + threadIdx.y;
 
       __syncthreads();  // Don't overwrite previous iteration values until they
@@ -2182,11 +2249,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(MASK);
       if (CudaTraits::WarpSize < word_count.value) {
         __syncthreads();
       }  // Protect against large scan values.
@@ -2457,9 +2520,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     for (typename Policy::member_type iwork_base = range.begin();
          iwork_base < range.end(); iwork_base += blockDim.y) {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-#endif
+      unsigned MASK = __activemask();
 
       const typename Policy::member_type iwork = iwork_base + threadIdx.y;
 
@@ -2474,11 +2535,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
         shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
       }
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(MASK);
       if (CudaTraits::WarpSize < word_count.value) {
         __syncthreads();
       }  // Protect against large scan values.
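The `word_size_type` alias introduced above shrinks the unit used to stage reduction values so that sub-4-byte scalars are not padded out to full 32-bit words. The selection logic can be checked in isolation; the sketch below restates the `std::conditional` chain from the diff with hypothetical stand-in names and compile-time checks.

```cpp
#include <cstdint>
#include <type_traits>

using size_type = std::uint32_t;  // stand-in for Kokkos::Cuda::size_type

// Pick the staging word: int16_t for 2-byte scalars, int8_t for 1-byte
// scalars, otherwise the backend's native 4-byte size_type.
template <class ValueType>
using word_size_type = typename std::conditional<
    sizeof(ValueType) < sizeof(size_type),
    typename std::conditional<sizeof(ValueType) == 2, std::int16_t,
                              std::int8_t>::type,
    size_type>::type;

static_assert(std::is_same<word_size_type<char>, std::int8_t>::value, "");
static_assert(std::is_same<word_size_type<short>, std::int16_t>::value, "");
static_assert(std::is_same<word_size_type<double>, size_type>::value, "");

int main() {}
```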
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index fc9fc3770bead16eff4a0b5b6fea8b0a2039200f..e5b05bcc64f183ef98248a239e6b305fae9410ea 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -191,48 +191,28 @@ __device__ bool cuda_inter_block_reduction(
         value_type tmp = Kokkos::shfl_down(value, 1, 32);
         if (id + 1 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-      int active        = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      unsigned int mask = __activemask();
+      int active        = __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2, 32);
         if (id + 2 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4, 32);
         if (id + 4 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8, 32);
         if (id + 8 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16, 32);
         if (id + 16 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
     }
   }
   // The last block has in its thread=0 the global reduction value through
@@ -388,48 +368,28 @@ __device__ inline
         value_type tmp = Kokkos::shfl_down(value, 1, 32);
         if (id + 1 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-      int active        = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      unsigned int mask = __activemask();
+      int active        = __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2, 32);
         if (id + 2 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4, 32);
         if (id + 4 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8, 32);
         if (id + 8 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16, 32);
         if (id + 16 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
     }
   }
 
@@ -573,23 +533,17 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
                                // part of the reduction
       const int width)         // How much of the warp participates
   {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
     unsigned mask =
         width == 32
             ? 0xffffffff
             : ((1 << width) - 1)
                   << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
-#endif
     const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
       if (lane_id + delta < 32) {
         ValueJoin::join(functor, value, value + delta);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(mask);
     }
     *value = *(value - lane_id);
   }
@@ -612,17 +566,18 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
       const unsigned int delta = (threadIdx.y * blockDim.x + threadIdx.x) * 32;
       if (delta < blockDim.x * blockDim.y)
         *my_shared_team_buffer_element = shared_team_buffer_element[delta];
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
       scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, false,
                                   blockDim.x * blockDim.y / 32);
       if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
     }
   }
 
+  template <class SizeType = Cuda::size_type>
   __device__ static inline bool scalar_inter_block_reduction(
       const FunctorType& functor, const Cuda::size_type /*block_id*/,
-      const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-      Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+      const Cuda::size_type block_count, SizeType* const shared_data,
+      SizeType* const global_data, Cuda::size_type* const global_flags) {
     Scalar* const global_team_buffer_element = ((Scalar*)global_data);
     Scalar* const my_global_team_buffer_element =
         global_team_buffer_element + blockIdx.x;
@@ -713,17 +668,17 @@ __device__ void cuda_intra_block_reduce_scan(
   const pointer_type tdata_intra = base_data + value_count * threadIdx.y;
 
   {  // Intra-warp reduction:
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
   }
 
   __syncthreads();  // Wait for all warps to reduce
@@ -732,57 +687,31 @@ __device__ void cuda_intra_block_reduce_scan(
     const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask)
                                 << CudaTraits::WarpIndexShift;
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    unsigned inner_mask =
-        KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff, (rtid_inter < blockDim.y));
-#endif
+    unsigned inner_mask = __ballot_sync(0xffffffff, (rtid_inter < blockDim.y));
     if (rtid_inter < blockDim.y) {
       const pointer_type tdata_inter =
           base_data + value_count * (rtid_inter ^ BlockSizeMask);
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
       if ((1 << 5) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
       }
       if ((1 << 6) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
       }
       if ((1 << 7) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
       }
       if ((1 << 8) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
       }
       if ((1 << 9) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
       }
-#else
-      if ((1 << 5) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
-      }
-      if ((1 << 6) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
-      }
-      if ((1 << 7) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
-      }
-      if ((1 << 8) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
-      }
-      if ((1 << 9) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
-      }
-#endif
 
       if (DoScan) {
         int n =
@@ -795,25 +724,14 @@ __device__ void cuda_intra_block_reduce_scan(
 
         if (!(rtid_inter + n < blockDim.y)) n = 0;
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 8)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 7)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 6)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 5)
-#else
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 8)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 7)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 6)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 5)
-#endif
       }
     }
   }
@@ -832,17 +750,17 @@ __device__ void cuda_intra_block_reduce_scan(
                                               : ((rtid_intra & 16) ? 16 : 0))));
 
     if (!(rtid_intra + n < blockDim.y)) n = 0;
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
   }
 
 #undef BLOCK_SCAN_STEP
@@ -858,12 +776,13 @@ __device__ void cuda_intra_block_reduce_scan(
  *  Global reduce result is in the last threads' 'shared_data' location.
  */
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType, class ArgTag,
+          class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan2(
     const FunctorType& functor, const Cuda::size_type block_id,
-    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
-  using size_type   = Cuda::size_type;
+    const Cuda::size_type block_count, SizeType* const shared_data,
+    SizeType* const global_data, Cuda::size_type* const global_flags) {
+  using size_type   = SizeType;
   using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
   using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
   using ValueInit   = FunctorValueInit<FunctorType, ArgTag>;
@@ -953,11 +872,12 @@ __device__ bool cuda_single_inter_block_reduce_scan2(
   return is_last_block;
 }
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType, class ArgTag,
+          class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan(
     const FunctorType& functor, const Cuda::size_type block_id,
-    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+    const Cuda::size_type block_count, SizeType* const shared_data,
+    SizeType* const global_data, Cuda::size_type* const global_flags) {
   using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
   if (!DoScan && ValueTraits::StaticValueSize > 0)
     return Kokkos::Impl::CudaReductionsFunctor<
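Throughout this file the pre-CUDA-9 compatibility macros are replaced by direct calls to the `__syncwarp`/`__ballot_sync`/`__shfl_*_sync` intrinsics, always with an explicit participation mask. For reference, a minimal warp-level sum reduction in the same style; this is a standalone sketch, not Kokkos code.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_sum(const int* in, int* out) {
  unsigned mask = __activemask();  // which lanes are participating
  int v         = in[threadIdx.x];

  // Classic shuffle-down tree: after log2(32) steps lane 0 holds the sum.
  for (int delta = 16; delta > 0; delta >>= 1) {
    v += __shfl_down_sync(mask, v, delta, 32);
    __syncwarp(mask);  // keep the per-step exchanges well ordered
  }
  if (threadIdx.x == 0) *out = v;
}

int main() {
  int h_in[32], *d_in, *d_out, h_out = 0;
  for (int i = 0; i < 32; ++i) h_in[i] = 1;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(int));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  warp_sum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("sum = %d (expect 32)\n", h_out);
  cudaFree(d_in);
  cudaFree(d_out);
}
```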
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 2004edbeacdb4b5b309ea3bd6eb83b3abcfacea6..88ac0d1878a911a876210fe06cc52fa1d8285be6 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -54,11 +54,27 @@
 #include <Kokkos_Core_fwd.hpp>
 
 #include <impl/Kokkos_TaskBase.hpp>
-#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
+#include <Cuda/Kokkos_Cuda_Error.hpp>  // KOKKOS_IMPL_CUDA_SAFE_CALL
 #include <impl/Kokkos_TaskTeamMember.hpp>
 
 //----------------------------------------------------------------------------
 
+#if defined(__CUDA_ARCH__)
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)                           \
+  {                                                                        \
+    __syncwarp();                                                          \
+    const unsigned b = __activemask();                                     \
+    if (b != 0xffffffff) {                                                 \
+      printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG,     \
+             blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
+             threadIdx.z, b);                                              \
+      return;                                                              \
+    }                                                                      \
+  }
+#else
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
+#endif
+
 namespace Kokkos {
 namespace Impl {
 namespace {
@@ -138,13 +154,13 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
       // Broadcast task pointer:
 
       // Sync before the broadcast
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
 
       // pretend it's an int* for shuffle purposes
       ((int*)&current_task)[0] =
-          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[0], 0, 32);
+          __shfl_sync(0xffffffff, ((int*)&current_task)[0], 0, 32);
       ((int*)&current_task)[1] =
-          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[1], 0, 32);
+          __shfl_sync(0xffffffff, ((int*)&current_task)[1], 0, 32);
 
       if (current_task) {
         KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());
@@ -168,7 +184,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
         // Synchronize threads of the warp and insure memory
         // writes are visible to all threads in the warp.
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (shared_memory_task_copy->is_team_runnable()) {
           // Thread Team Task
@@ -182,7 +198,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
         // Synchronize threads of the warp and insure memory
         // writes are visible to all threads in the warp.
 
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
         // b -= b % CudaTraits::WarpSize;
@@ -196,7 +212,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
         // writes are visible to root thread of the warp for
         // respawn or completion.
 
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (warp_lane == 0) {
           // If respawn requested copy respawn data back to main memory
@@ -249,12 +265,14 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
     auto& queue = scheduler.queue();
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Pre Task Execution");
 
     // Query the stack size, in bytes:
 
     size_t previous_stack_size = 0;
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
 
     // If not large enough then set the stack size, in bytes:
@@ -262,18 +280,21 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     const size_t larger_stack_size = 1 << 11;
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Post Task Execution");
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
     }
   }
@@ -295,13 +316,17 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     destroy_type* dtor_ptr =
         (destroy_type*)((char*)storage + sizeof(function_type));
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Pre Get Function Pointer for Tasks");
 
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Post Get Function Pointer for Tasks");
 
     ptr  = *ptr_ptr;
     dtor = *dtor_ptr;
@@ -372,23 +397,20 @@ class TaskQueueSpecializationConstrained<
           // count of 0 also. Otherwise, returns a task from another queue
           // or `end` if one couldn't be popped
           task_ptr = team_queue.attempt_to_steal_task();
-#if 0
-          if(task != no_more_tasks_sentinel && task != end) {
-            std::printf("task stolen on rank %d\n", team_exec.league_rank());
-          }
-#endif
         }
       }
 
       // Synchronize warp with memory fence before broadcasting task pointer:
 
       // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
 
       // Broadcast task pointer:
 
-      ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32);
-      ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32);
+      ((int*)&task_ptr)[0] =
+          __shfl_sync(0xffffffff, ((int*)&task_ptr)[0], 0, 32);
+      ((int*)&task_ptr)[1] =
+          __shfl_sync(0xffffffff, ((int*)&task_ptr)[1], 0, 32);
 
 #if defined(KOKKOS_ENABLE_DEBUG)
       KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr");
@@ -418,7 +440,7 @@ class TaskQueueSpecializationConstrained<
         // writes are visible to all threads in the warp.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (task_root_type::TaskTeam == task_shmem->m_task_type) {
           // Thread Team Task
@@ -432,7 +454,7 @@ class TaskQueueSpecializationConstrained<
         // writes are visible to all threads in the warp.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // copy task closure from shared to global memory:
 
@@ -445,7 +467,7 @@ class TaskQueueSpecializationConstrained<
         // respawn or completion.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // If respawn requested copy respawn data back to main memory
 
@@ -475,12 +497,14 @@ class TaskQueueSpecializationConstrained<
     auto& queue = scheduler.queue();
     queue.initialize_team_queues(warps_per_block * grid.x);
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::execute: Pre Execute Task");
 
     // Query the stack size, in bytes:
 
     size_t previous_stack_size = 0;
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
 
     // If not large enough then set the stack size, in bytes:
@@ -488,18 +512,21 @@ class TaskQueueSpecializationConstrained<
     const size_t larger_stack_size = 2048;
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::execute: Post Execute Task");
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
     }
   }
@@ -516,13 +543,17 @@ class TaskQueueSpecializationConstrained<
     destroy_type* dtor_ptr =
         (destroy_type*)((char*)storage + sizeof(function_type));
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::get_function_pointer: Pre Get Function Pointer");
 
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::get_function_pointer: Post Get Function Pointer");
 
     ptr  = *ptr_ptr;
     dtor = *dtor_ptr;
@@ -609,7 +640,7 @@ class TaskExec<Kokkos::Cuda, Scheduler> {
 
   __device__ void team_barrier() const {
     if (1 < m_team_size) {
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
     }
   }
 
@@ -1205,5 +1236,7 @@ KOKKOS_INLINE_FUNCTION void single(
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN
+
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
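One idiom worth calling out from the task-queue changes above: a 64-bit task pointer is broadcast from lane 0 by shuffling its two 32-bit halves separately (the `((int*)&task_ptr)[0/1]` lines). A reduced sketch of that trick, assuming 64-bit pointers and preserving the same type-punning style as the original:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__device__ int g_value = 7;

__global__ void broadcast_pointer() {
  // Only lane 0 "finds" the pointer; everyone else starts with nullptr.
  int* ptr = (threadIdx.x == 0) ? &g_value : nullptr;

  // A 64-bit pointer does not fit in one 32-bit shuffle, so broadcast its
  // two halves from lane 0 to the whole warp, as the task queue does.
  ((int*)&ptr)[0] = __shfl_sync(0xffffffff, ((int*)&ptr)[0], 0, 32);
  ((int*)&ptr)[1] = __shfl_sync(0xffffffff, ((int*)&ptr)[1], 0, 32);

  if (ptr != nullptr && threadIdx.x == 31)
    printf("lane 31 sees value %d\n", *ptr);
}

int main() {
  broadcast_pointer<<<1, 32>>>();
  cudaDeviceSynchronize();
}
```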
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index e7806390155d46fd811a21432d9f9d268c457468..922b980a2545b4e35d573d44806d76fdf1ca1ea2 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -340,191 +340,6 @@ class CudaTeamMember {
 #endif
   }
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
-                    void* const global_scratch_space, void* const shmem,
-                    int const shmem_size) {
-#ifdef __CUDA_ARCH__
-
-    using value_type   = typename ReducerType::value_type;
-    using pointer_type = value_type volatile*;
-
-    // Number of shared memory entries for the reduction:
-    const int nsh = shmem_size / sizeof(value_type);
-
-    // Number of CUDA threads in the block, rank within the block
-    const int nid = blockDim.x * blockDim.y * blockDim.z;
-    const int tid =
-        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
-
-    // Reduces within block using all available shared memory
-    // Contributes if it is the root "vector lane"
-
-    // wn == number of warps in the block
-    // wx == which lane within the warp
-    // wy == which warp within the block
-
-    const int wn =
-        (nid + CudaTraits::WarpIndexMask) >> CudaTraits::WarpIndexShift;
-    const int wx = tid & CudaTraits::WarpIndexMask;
-    const int wy = tid >> CudaTraits::WarpIndexShift;
-
-    //------------------------
-    {  // Intra warp shuffle reduction from contributing CUDA threads
-
-      value_type tmp(reducer.reference());
-
-      for (int i = CudaTraits::WarpSize; (int)blockDim.x <= (i >>= 1);) {
-        Impl::in_place_shfl_down(reducer.reference(), tmp, i,
-                                 CudaTraits::WarpSize);
-
-        // Root of each vector lane reduces "thread" contribution
-        if (0 == threadIdx.x && wx < i) {
-          reducer.join(&tmp, reducer.data());
-        }
-      }
-
-      // Reduce across warps using shared memory.
-      // Number of warps may not be power of two.
-
-      __syncthreads();  // Wait before shared data write
-
-      // Number of shared memory entries for the reduction
-      // is at most one per warp
-      const int nentry = wn < nsh ? wn : nsh;
-
-      if (0 == wx && wy < nentry) {
-        // Root thread of warp 'wy' has warp's value to contribute
-        ((value_type*)shmem)[wy] = tmp;
-      }
-
-      __syncthreads();  // Wait for write to be visible to block
-
-      // When more warps than shared entries
-      // then warps must take turns joining their contribution
-      // to the designated shared memory entry.
-      for (int i = nentry; i < wn; i += nentry) {
-        const int k = wy - i;
-
-        if (0 == wx && i <= wy && k < nentry) {
-          // Root thread of warp 'wy' has warp's value to contribute
-          reducer.join(((value_type*)shmem) + k, &tmp);
-        }
-
-        __syncthreads();  // Wait for write to be visible to block
-      }
-
-      // One warp performs the inter-warp reduction:
-
-      if (0 == wy) {
-        // Start fan-in at power of two covering nentry
-
-        for (int i = (1 << (32 - __clz(nentry - 1))); (i >>= 1);) {
-          const int k = wx + i;
-          if (wx < i && k < nentry) {
-            reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
-            __threadfence_block();  // Wait for write to be visible to warp
-          }
-        }
-      }
-    }
-    //------------------------
-    {  // Write block's value to global_scratch_memory
-
-      int last_block = 0;
-
-      if (0 == wx) {
-        reducer.copy(((pointer_type)global_scratch_space) +
-                         blockIdx.x * reducer.length(),
-                     reducer.data());
-
-        __threadfence();  // Wait until global write is visible.
-
-        last_block = (int)gridDim.x ==
-                     1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);
-
-        // If last block then reset count
-        if (last_block) *global_scratch_flags = 0;
-      }
-
-      last_block = __syncthreads_or(last_block);
-
-      if (!last_block) return 0;
-    }
-    //------------------------
-    // Last block reads global_scratch_memory into shared memory.
-
-    const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
-                                       : (gridDim.x < nsh ? gridDim.x : nsh);
-
-    // nentry = min( nid , nsh , gridDim.x )
-
-    // whole block reads global memory into shared memory:
-
-    if (tid < nentry) {
-      const int offset = tid * reducer.length();
-
-      reducer.copy(((pointer_type)shmem) + offset,
-                   ((pointer_type)global_scratch_space) + offset);
-
-      for (int i = nentry + tid; i < (int)gridDim.x; i += nentry) {
-        reducer.join(
-            ((pointer_type)shmem) + offset,
-            ((pointer_type)global_scratch_space) + i * reducer.length());
-      }
-    }
-
-    __syncthreads();  // Wait for writes to be visible to block
-
-    if (0 == wy) {
-      // Iterate to reduce shared memory to single warp fan-in size
-
-      const int nreduce =
-          CudaTraits::WarpSize < nentry ? CudaTraits::WarpSize : nentry;
-
-      // nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )
-
-      if (wx < nreduce && nreduce < nentry) {
-        for (int i = nreduce + wx; i < nentry; i += nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
-        }
-        __threadfence_block();  // Wait for writes to be visible to warp
-      }
-
-      // Start fan-in at power of two covering nentry
-
-      for (int i = (1 << (32 - __clz(nreduce - 1))); (i >>= 1);) {
-        const int k = wx + i;
-        if (wx < i && k < nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
-          __threadfence_block();  // Wait for writes to be visible to warp
-        }
-      }
-
-      if (0 == wx) {
-        reducer.copy(reducer.data(), (pointer_type)shmem);
-        return 1;
-      }
-    }
-    return 0;
-
-#else
-    (void)reducer;
-    (void)global_scratch_flags;
-    (void)global_scratch_space;
-    (void)shmem;
-    (void)shmem_size;
-    return 0;
-#endif
-  }
-
   //----------------------------------------
   // Private for the driver
 
@@ -533,7 +348,7 @@ class CudaTeamMember {
                  void* scratch_level_1_ptr, const int scratch_level_1_size,
                  const int arg_league_rank, const int arg_league_size)
       : m_team_reduce(shared),
-        m_team_shared(((char*)shared) + shared_begin, shared_size,
+        m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
                       scratch_level_1_ptr, scratch_level_1_size),
         m_team_reduce_size(shared_begin),
         m_league_rank(arg_league_rank),
@@ -854,14 +669,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
        i += blockDim.x) {
     closure(i);
   }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
@@ -1100,14 +911,10 @@ KOKKOS_INLINE_FUNCTION void single(
   (void)lambda;
 #ifdef __CUDA_ARCH__
   if (threadIdx.x == 0) lambda();
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
@@ -1118,14 +925,10 @@ KOKKOS_INLINE_FUNCTION void single(
   (void)lambda;
 #ifdef __CUDA_ARCH__
   if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
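The three mask computations above are the same expression inlined at each
call site: when a team uses fewer than 32 lanes per vector, only the
sub-warp that owns the calling thread is synchronized. A host-side sketch
of the mask arithmetic follows; the helper name and the sample values are
illustrative, not part of the patch.

    #include <cassert>
    #include <cstdint>

    // blockDim.x lanes per vector sub-warp; threadIdx.y selects which
    // sub-warp of the 32-thread CUDA warp the caller lives in.
    constexpr std::uint32_t subwarp_mask(unsigned block_dim_x,
                                         unsigned thread_idx_y) {
      return block_dim_x == 32
                 ? 0xffffffffu
                 : ((1u << block_dim_x) - 1u)
                       << (thread_idx_y % (32 / block_dim_x)) * block_dim_x;
    }

    int main() {
      // 8 lanes per sub-warp: the sub-warp with y % 4 == 1 owns bits 8..15.
      assert(subwarp_mask(8, 1) == 0x0000ff00u);
      // A full-width team synchronizes the entire warp.
      assert(subwarp_mask(32, 7) == 0xffffffffu);
    }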
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
index 7f7b7b6e78adc3de9d5ae446565eedc7d00439f5..31d3c47e1c9c9af3b6c6d8c918abe01dd0b238fe 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -48,7 +48,12 @@
 #ifdef KOKKOS_ENABLE_CUDA
 
 #include <type_traits>
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+
+#if !defined(KOKKOS_COMPILER_CLANG)
+#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
+#else
+#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
+#endif
 
 namespace Kokkos {
 
@@ -61,7 +66,7 @@ constexpr unsigned shfl_all_mask = 0xffffffffu;
 // Shuffle operations require input to be a register (stack) variable
 
 // Derived implements do_shfl_op(unsigned mask, T& in, int lane, int width),
-// which turns in to one of KOKKOS_IMPL_CUDA_SHFL(_UP_|_DOWN_|_)MASK
+// which turns into one of __shfl_sync, __shfl_up_sync, or __shfl_down_sync
 // Since the logic with respect to value sizes, etc., is the same everywhere,
 // put it all in one place.
 template <class Derived>
@@ -157,7 +162,7 @@ struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> {
     (void)val;
     (void)lane;
     (void)width;
-    return KOKKOS_IMPL_CUDA_SHFL_MASK(mask, val, lane, width);
+    return __shfl_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -170,7 +175,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> {
   __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val,
                                                   int lane, int width) const
       noexcept {
-    return KOKKOS_IMPL_CUDA_SHFL_UP_MASK(mask, val, lane, width);
+    return __shfl_up_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -188,7 +193,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> {
     (void)val;
     (void)lane;
     (void)width;
-    return KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(mask, val, lane, width);
+    return __shfl_down_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -228,5 +233,7 @@ __device__ inline T shfl_up(const T& val, int delta, int width,
 
 }  // end namespace Kokkos
 
+#undef KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF
+
 #endif  // defined( KOKKOS_ENABLE_CUDA )
 #endif  // !defined( KOKKOS_CUDA_VECTORIZATION_HPP )
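With the wrappers now calling the *_sync shuffle intrinsics directly, the
shfl-down reduction they enable can be shown standalone. A minimal CUDA
sketch assuming a single full warp; the kernel and values are illustrative,
not part of the patch.

    #include <cstdio>

    // Full-warp sum via __shfl_down_sync, the intrinsic that
    // in_place_shfl_down_fn::do_shfl_op now calls directly.
    __global__ void warp_sum(int* out) {
      int v = threadIdx.x;  // each lane contributes its lane id
      for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, offset);
      if (threadIdx.x == 0) *out = v;  // lane 0 holds 0 + 1 + ... + 31 == 496
    }

    int main() {
      int *d_out, h_out = 0;
      cudaMalloc(&d_out, sizeof(int));
      warp_sum<<<1, 32>>>(d_out);
      cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
      std::printf("%d\n", h_out);  // prints 496
      cudaFree(d_out);
    }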
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
deleted file mode 100644
index 0cdd84ce27157e118065c6fbcf2da71a875b81e0..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <Kokkos_Macros.hpp>
-
-#if defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
-#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
-#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(), x)
-#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) __ballot_sync(m, x)
-#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) __shfl_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) __shfl_sync(m, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) __shfl_up_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m, x, y, z) __shfl_up_sync(m, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) \
-  __shfl_down_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) __shfl_down_sync(m, x, y, z)
-#else
-#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
-#define KOKKOS_IMPL_CUDA_SYNCWARP
-#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m
-#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
-#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) 0
-#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) 0
-#endif
-
-#if !defined(KOKKOS_COMPILER_CLANG)
-#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
-#else
-#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
-#endif
-
-#if defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)                           \
-  {                                                                        \
-    __syncwarp();                                                          \
-    const unsigned b = __activemask();                                     \
-    if (b != 0xffffffff) {                                                 \
-      printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG,     \
-             blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
-             threadIdx.z, b);                                              \
-      return;                                                              \
-    }                                                                      \
-  }
-#else
-#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
-#endif
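Deleting this header also removes the host-side no-op expansions it
provided, so any remaining code compiled for both host and device must
guard the intrinsics itself. A sketch of the guard the direct calls now
rely on; the function name is illustrative.

    // Mixed host/device helper: real warp barrier on the device path,
    // nothing to synchronize on the host path.
    __host__ __device__ inline void sync_lanes() {
    #ifdef __CUDA_ARCH__
      __syncwarp(0xffffffff);
    #endif
    }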
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
index 9278d1bdc9efcc2a76183085c974afef41413e3c..7eb3e1e9f70fe4cf724e3b766e38ebc16b3c7c8f 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
@@ -45,6 +45,7 @@
 #ifndef KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
 #define KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
 
+#include <functional>
 #include <Kokkos_Macros.hpp>
 
 #if defined(__HIPCC__)
@@ -56,118 +57,239 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <typename DriverType, bool, int MaxThreadsPerBlock, int MinBlocksPerSM>
-void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
-  // FIXME_HIP - currently the "constant" path is unimplemented.
-  //             we should look at whether it's functional, and
-  //             perform some simple scaling studies to see when /
-  //             if the constant launcher outperforms the current
-  //             pass by pointer shared launcher
-  HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-      numBlocks,
-      hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                       MinBlocksPerSM>,
-      blockSize, sharedmem));
-}
+enum class BlockType { Max, Preferred };
 
-template <typename DriverType, bool constant>
-void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
-  hipOccupancy<DriverType, constant, HIPTraits::MaxThreadsPerBlock, 1>(
-      numBlocks, blockSize, sharedmem);
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+unsigned get_preferred_blocksize_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // use the user specified value
+    return LaunchBounds::maxTperB;
+  } else {
+    if (HIPParallelLaunch<DriverType, LaunchBounds,
+                          LaunchMechanism>::get_scratch_size() > 0) {
+      return HIPTraits::ConservativeThreadsPerBlock;
+    }
+    return HIPTraits::MaxThreadsPerBlock;
+  }
 }
 
-template <class FunctorType, class LaunchBounds, typename F>
-int hip_internal_get_block_size(const F &condition_check,
-                                const HIPInternal *hip_instance,
-                                const hipFuncAttributes &attr,
-                                const FunctorType &f,
-                                const size_t vector_length,
-                                const size_t shmem_block,
-                                const size_t shmem_thread) {
-  const int min_blocks_per_sm =
-      LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
-  const int max_threads_per_block = LaunchBounds::maxTperB == 0
-                                        ? HIPTraits::MaxThreadsPerBlock
-                                        : LaunchBounds::maxTperB;
-
-  const int regs_per_wavefront  = std::max(attr.numRegs, 1);
-  const int regs_per_sm         = hip_instance->m_regsPerSM;
-  const int shmem_per_sm        = hip_instance->m_shmemPerSM;
-  const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock;
-  const int max_blocks_per_sm   = hip_instance->m_maxBlocksPerSM;
-  const int max_threads_per_sm  = hip_instance->m_maxThreadsPerSM;
-
-  int block_size = max_threads_per_block;
-  KOKKOS_ASSERT(block_size > 0);
-  const int blocks_per_warp =
-      (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize;
-
-  int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
-      f, block_size / vector_length);
-  int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                    functor_shmem + attr.sharedSizeBytes;
-  int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
-  int max_blocks_shmem =
-      (total_shmem < max_shmem_per_block)
-          ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-          : 0;
-  int blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-  int threads_per_sm = blocks_per_sm * block_size;
-  if (threads_per_sm > max_threads_per_sm) {
-    blocks_per_sm  = max_threads_per_sm / block_size;
-    threads_per_sm = blocks_per_sm * block_size;
+// FIXME_HIP - entire function could be constexpr for c++17
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+unsigned get_max_blocksize_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // use the user specified value
+    return LaunchBounds::maxTperB;
+  } else {
+    // we can always fit 1024-thread blocks if we only care about registers
+    // ... and don't mind spilling
+    return HIPTraits::MaxThreadsPerBlock;
   }
-  int opt_block_size =
-      (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm;
-  int opt_threads_per_sm = threads_per_sm;
-  block_size -= HIPTraits::WarpSize;
-  while (condition_check(blocks_per_sm) &&
-         (block_size >= HIPTraits::WarpSize)) {
-    functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
-        f, block_size / vector_length);
-    total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                  functor_shmem + attr.sharedSizeBytes;
-    max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
-    max_blocks_shmem =
-        (total_shmem < max_shmem_per_block)
-            ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-            : 0;
-    blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-    threads_per_sm = blocks_per_sm * block_size;
-    if (threads_per_sm > max_threads_per_sm) {
-      blocks_per_sm  = max_threads_per_sm / block_size;
-      threads_per_sm = blocks_per_sm * block_size;
-    }
-    if ((blocks_per_sm >= min_blocks_per_sm) &&
-        (blocks_per_sm <= max_blocks_per_sm)) {
-      if (threads_per_sm >= opt_threads_per_sm) {
-        opt_block_size     = block_size;
-        opt_threads_per_sm = threads_per_sm;
+}
+
+// convenience method to select and return the proper function attributes
+// for a kernel, given the launch bounds et al.
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          BlockType BlockSize = BlockType::Max,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+hipFuncAttributes get_hip_func_attributes_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // for user defined, we *always* honor the request
+    return HIPParallelLaunch<DriverType, LaunchBounds,
+                             LaunchMechanism>::get_hip_func_attributes();
+  } else {
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      return HIPParallelLaunch<
+          DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+          LaunchMechanism>::get_hip_func_attributes();
+    } else {
+      const int blocksize =
+          get_preferred_blocksize_impl<DriverType, LaunchBounds,
+                                       LaunchMechanism>();
+      if (blocksize == HIPTraits::MaxThreadsPerBlock) {
+        return HIPParallelLaunch<
+            DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+            LaunchMechanism>::get_hip_func_attributes();
+      } else {
+        return HIPParallelLaunch<
+            DriverType,
+            Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
+            LaunchMechanism>::get_hip_func_attributes();
       }
     }
-    block_size -= HIPTraits::WarpSize;
   }
-  return opt_block_size;
 }
 
-template <class FunctorType, class LaunchBounds>
-int hip_get_max_block_size(const HIPInternal *hip_instance,
-                           const hipFuncAttributes &attr, const FunctorType &f,
-                           const size_t vector_length, const size_t shmem_block,
-                           const size_t shmem_thread) {
-  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
-      [](int x) { return x == 0; }, hip_instance, attr, f, vector_length,
-      shmem_block, shmem_thread);
+// Given an initial block-size limitation based on register usage
+// determine the block size to select based on LDS limitation
+template <BlockType BlockSize, class DriverType, class LaunchBounds,
+          typename ShmemFunctor>
+unsigned hip_internal_get_block_size(const HIPInternal *hip_instance,
+                                     const ShmemFunctor &f,
+                                     const unsigned tperb_reg) {
+  // translate LB from CUDA to HIP
+  const unsigned min_waves_per_eu =
+      LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1;
+  const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize;
+  const unsigned shmem_per_sm       = hip_instance->m_shmemPerSM;
+  unsigned block_size               = tperb_reg;
+  do {
+    unsigned total_shmem = f(block_size);
+    // find how many threads we can fit with this blocksize based on LDS usage
+    unsigned tperb_shmem = total_shmem > shmem_per_sm ? 0 : block_size;
+
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      // we want the maximum blocksize possible
+      // just wait until we get a case where we can fit the LDS per SM
+      if (tperb_shmem) return block_size;
+    } else {
+      if (block_size == tperb_reg && tperb_shmem >= tperb_reg) {
+        // fast path: exit on the first iteration when registers are more
+        // limiting than LDS usage and just use the register-limited size
+        return tperb_reg;
+      }
+      // otherwise we need to apply a heuristic to choose the blocksize
+      // the current launchbound selection scheme is:
+      //      1. If no spills, choose 1024 [MaxThreadsPerBlock]
+      //      2. Otherwise, choose 256 [ConservativeThreadsPerBlock]
+      //
+      // For blocksizes between 256 and 1024, we'll be forced to use the 1024 LB
+      // and we'll already have pretty decent occupancy, thus dropping to 256
+      // *probably* isn't a concern
+      const unsigned blocks_per_cu_shmem = shmem_per_sm / total_shmem;
+      const unsigned tperb = tperb_shmem < tperb_reg ? tperb_shmem : tperb_reg;
+
+      // for anything with more than 4 wavefronts that can fit multiple
+      // blocks we're probably not occupancy-limited, so just return that
+      if (blocks_per_cu_shmem > 1 &&
+          tperb > HIPTraits::ConservativeThreadsPerBlock) {
+        return block_size;
+      }
+
+      // otherwise, it's probably better to drop to the first valid size that
+      // fits in the ConservativeThreadsPerBlock
+      if (tperb >= min_threads_per_sm) return block_size;
+    }
+    block_size >>= 1;
+  } while (block_size >= HIPTraits::WarpSize);
+  // TODO: return a negative value and report an error at kernel launch
+  return 0;
+}
+
+// Standardized blocksize deduction for parallel constructs with no LDS usage
+// Returns the preferred blocksize as dictated by register usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds>
+unsigned hip_get_preferred_blocksize() {
+  return get_preferred_blocksize_impl<DriverType, LaunchBounds>();
+}
+
+// Standardized blocksize deduction for parallel constructs with no LDS usage
+// Returns the max blocksize as dictated by register usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds>
+unsigned hip_get_max_blocksize() {
+  return get_max_blocksize_impl<DriverType, LaunchBounds>();
+}
+
+// Standardized blocksize deduction for non-teams parallel constructs with LDS
+// usage. Returns the 'preferred' blocksize, as determined by the heuristics in
+// hip_internal_get_block_size
+//
+// The ShmemFunctor takes a single argument of the current blocksize under
+// consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
+unsigned hip_get_preferred_blocksize(HIPInternal const *hip_instance,
+                                     ShmemFunctor const &f) {
+  // get preferred blocksize limited by register usage
+  const unsigned tperb_reg =
+      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
+                                     LaunchBounds>(hip_instance, f, tperb_reg);
+}
+
+// Standardized blocksize deduction for teams-based parallel constructs with LDS
+// usage. Returns the 'preferred' blocksize, as determined by the heuristics in
+// hip_internal_get_block_size
+//
+// The ShmemTeamsFunctor takes two arguments: the hipFuncAttributes and
+// the current blocksize under consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds,
+          typename ShmemTeamsFunctor>
+unsigned hip_get_preferred_team_blocksize(HIPInternal const *hip_instance,
+                                          ShmemTeamsFunctor const &f) {
+  hipFuncAttributes attr =
+      get_hip_func_attributes_impl<DriverType, LaunchBounds,
+                                   BlockType::Preferred>();
+  // get preferred blocksize limited by register usage
+  using namespace std::placeholders;
+  const unsigned tperb_reg =
+      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
+                                     LaunchBounds>(
+      hip_instance, std::bind(f, attr, _1), tperb_reg);
+}
+
+// Standardized blocksize deduction for non-teams parallel constructs with LDS
+// usage. Returns the maximum possible blocksize, as determined by the heuristics
+// in hip_internal_get_block_size
+//
+// The ShmemFunctor takes a single argument of the current blocksize under
+// consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
+unsigned hip_get_max_blocksize(HIPInternal const *hip_instance,
+                               ShmemFunctor const &f) {
+  // get max blocksize limited by register usage
+  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
+      hip_instance, f, tperb_reg);
 }
 
-template <typename FunctorType, typename LaunchBounds>
-int hip_get_opt_block_size(HIPInternal const *hip_instance,
-                           hipFuncAttributes const &attr, FunctorType const &f,
-                           size_t const vector_length, size_t const shmem_block,
-                           size_t const shmem_thread) {
-  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
-      [](int) { return true; }, hip_instance, attr, f, vector_length,
-      shmem_block, shmem_thread);
+// Standardized blocksize deduction for teams-based parallel constructs with LDS
+// usage. Returns the maximum possible blocksize, as determined by the heuristics
+// in hip_internal_get_block_size
+//
+// The ShmemTeamsFunctor takes two arguments: the hipFuncAttributes and
+// the current blocksize under consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds,
+          typename ShmemTeamsFunctor>
+unsigned hip_get_max_team_blocksize(HIPInternal const *hip_instance,
+                                    ShmemTeamsFunctor const &f) {
+  hipFuncAttributes attr =
+      get_hip_func_attributes_impl<DriverType, LaunchBounds, BlockType::Max>();
+  // get max blocksize
+  using namespace std::placeholders;
+  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
+      hip_instance, std::bind(f, attr, _1), tperb_reg);
 }
 
 }  // namespace Impl
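The halving search in hip_internal_get_block_size can be summarized
standalone: start from the register-limited block size and shrink until
the LDS footprint fits. A host-side sketch; the per-thread LDS model and
the byte budgets are assumptions for illustration, not values from the
patch.

    #include <cstdio>

    // Halve the candidate block size until the (hypothetical) LDS
    // footprint fits within the per-CU budget; zero signals failure,
    // matching the convention documented above.
    unsigned max_block_size(unsigned tperb_reg, unsigned shmem_per_sm) {
      for (unsigned block_size = tperb_reg; block_size >= 64 /*WarpSize*/;
           block_size >>= 1) {
        unsigned total_shmem = 64u * block_size;  // assumed 64 B per thread
        if (total_shmem <= shmem_per_sm) return block_size;
      }
      return 0;  // caller is responsible for error handling
    }

    int main() {
      // 1024 threads would need 64 KiB; a 32 KiB budget settles on 512.
      std::printf("%u\n", max_block_size(1024, 32 * 1024));  // prints 512
    }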
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
index b3480bcad00c7ec6bc1a011a49fe7f9ae5eba345..a75e7a4a6c9351c0d39f7b2f7e8719a1a81c0adf 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
@@ -66,12 +66,30 @@ inline void hip_internal_safe_call(hipError_t e, const char* name,
   }
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
+KOKKOS_DEPRECATED
+inline void hip_internal_safe_call_deprecated(hipError_t e, const char* name,
+                                              const char* file = nullptr,
+                                              const int line   = 0) {
+  hip_internal_safe_call(e, name, file, line);
+}
+
+#endif
+
 }  // namespace Impl
 }  // namespace Kokkos
 
-#define HIP_SAFE_CALL(call) \
+#define KOKKOS_IMPL_HIP_SAFE_CALL(call) \
   Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__)
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+#define HIP_SAFE_CALL(call)                                              \
+  Kokkos::Impl::hip_internal_safe_call_deprecated(call, #call, __FILE__, \
+                                                  __LINE__)
+
+#endif
+
 namespace Kokkos {
 namespace Experimental {
 
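The rename keeps the old HIP_SAFE_CALL spelling compiling behind
KOKKOS_ENABLE_DEPRECATED_CODE_3 while routing it through a
KOKKOS_DEPRECATED forwarding function, so each remaining use site earns a
compiler warning instead of a hard break. A generic sketch of the same
pattern; all names here are illustrative.

    #include <cstdio>

    inline void safe_call(int err, const char* what) {
      if (err != 0) std::fprintf(stderr, "error %d in %s\n", err, what);
    }

    // The old spelling forwards through a [[deprecated]] function, so the
    // macro still expands and compiles but every use warns.
    [[deprecated("use NEW_SAFE_CALL instead")]] inline void
    safe_call_deprecated(int err, const char* what) {
      safe_call(err, what);
    }

    #define NEW_SAFE_CALL(call) safe_call((call), #call)
    #define OLD_SAFE_CALL(call) safe_call_deprecated((call), #call)

    int main() {
      NEW_SAFE_CALL(0);  // fine
      OLD_SAFE_CALL(0);  // compiles, but emits a deprecation warning
    }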
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index 18ef10e22cd39b30118f78882a3ce747c19b9901..336ac8c6987c6538836f49792c41fd5520d0af8a 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -77,7 +77,7 @@ class HIPInternalDevices {
 };
 
 HIPInternalDevices::HIPInternalDevices() {
-  HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
 
   if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
@@ -85,7 +85,7 @@ HIPInternalDevices::HIPInternalDevices() {
         "have. Please report this to github.com/kokkos/kokkos.");
   }
   for (int i = 0; i < m_hipDevCount; ++i) {
-    HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
   }
 }
 
@@ -95,6 +95,9 @@ const HIPInternalDevices &HIPInternalDevices::singleton() {
 }
 }  // namespace
 
+unsigned long *Impl::HIPInternal::constantMemHostStaging = nullptr;
+hipEvent_t Impl::HIPInternal::constantMemReusable        = nullptr;
+
 namespace Impl {
 
 //----------------------------------------------------------------------------
@@ -154,6 +157,9 @@ int HIPInternal::verify_is_initialized(const char *const label) const {
   return 0 <= m_hipDev;
 }
 
+uint32_t HIPInternal::impl_get_instance_id() const noexcept {
+  return m_instance_id;
+}
 HIPInternal &HIPInternal::singleton() {
   static HIPInternal *self = nullptr;
   if (!self) {
@@ -163,12 +169,23 @@ HIPInternal &HIPInternal::singleton() {
 }
 
 void HIPInternal::fence() const {
-  HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
-  // can reset our cycle id now as well
-  m_cycleId = 0;
+  fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence");
+}
+void HIPInternal::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          impl_get_instance_id()},
+      [&]() {
+        KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
+        // can reset our cycle id now as well
+        m_cycleId = 0;
+      });
 }
 
-void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
+void HIPInternal::initialize(int hip_device_id, hipStream_t stream,
+                             bool manage_stream) {
   if (was_finalized)
     Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n");
 
@@ -197,9 +214,10 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     m_hipDev     = hip_device_id;
     m_deviceProp = hipProp;
 
-    HIP_SAFE_CALL(hipSetDevice(m_hipDev));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(m_hipDev));
 
     m_stream                    = stream;
+    m_manage_stream             = manage_stream;
     m_team_scratch_current_size = 0;
     m_team_scratch_ptr          = nullptr;
 
@@ -222,7 +240,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     // theoretically, we can get 40 WF's / CU, but only can sustain 32
     // see
     // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742
-    m_maxBlocksPerSM = 32;
+    m_maxWavesPerCU = 32;
     // FIXME_HIP - Nick to implement this upstream
     //             Register count comes from Sec. 2.2. "Data Sharing" of the
     //             Vega 7nm ISA document (see the diagram)
@@ -232,7 +250,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     m_regsPerSM        = 65536;
     m_shmemPerSM       = hipProp.maxSharedMemoryPerMultiProcessor;
     m_maxShmemPerBlock = hipProp.sharedMemPerBlock;
-    m_maxThreadsPerSM  = m_maxBlocksPerSM * HIPTraits::WarpSize;
+    m_maxThreadsPerSM  = m_maxWavesPerCU * HIPTraits::WarpSize;
     //----------------------------------
     // Multiblock reduction uses scratch flags for counters
     // and scratch space for partial reduction values.
@@ -265,8 +283,8 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
 
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
 
-      HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
-                              sizeof(uint32_t) * buffer_bound));
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
+                                          sizeof(uint32_t) * buffer_bound));
     }
     //----------------------------------
 
@@ -287,6 +305,15 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
 
   // Init the array for used for arbitrarily sized atomics
   if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays();
+
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting the driver of a previous kernel launch
+  if (m_stream == nullptr) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipHostMalloc(reinterpret_cast<void **>(&constantMemHostStaging),
+                      HIPTraits::ConstantMemoryUsage));
+
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventCreate(&constantMemReusable));
+  }
 }
 
 //----------------------------------------------------------------------------
@@ -339,7 +366,7 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags(
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
   }
 
@@ -365,7 +392,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes,
 //----------------------------------------------------------------------------
 
 void HIPInternal::finalize() {
-  this->fence();
+  this->fence("Kokkos::HIPInternal::finalize: fence on finalization");
   was_finalized = true;
   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
     using RecordHIP =
@@ -378,6 +405,9 @@ void HIPInternal::finalize() {
     if (m_team_scratch_current_size > 0)
       Kokkos::kokkos_free<Kokkos::Experimental::HIPSpace>(m_team_scratch_ptr);
 
+    if (m_manage_stream && m_stream != nullptr)
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream));
+
     m_hipDev                    = -1;
     m_hipArch                   = -1;
     m_multiProcCount            = 0;
@@ -395,28 +425,36 @@ void HIPInternal::finalize() {
     m_team_scratch_ptr          = nullptr;
   }
   if (nullptr != d_driverWorkArray) {
-    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
     d_driverWorkArray = nullptr;
   }
+
+  // only destroy these if we're finalizing the singleton
+  if (this == &singleton()) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
+  }
 }
 
 char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
   std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
   if (d_driverWorkArray == nullptr) {
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipHostMalloc(&d_driverWorkArray,
                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                       hipHostMallocNonCoherent));
   }
   if (driverTypeSize > m_maxDriverTypeSize) {
     // fence handles the cycle id reset for us
-    fence();
-    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    fence(
+        "Kokkos::HIPInternal::get_next_driver: fence before reallocating "
+        "resources");
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
     m_maxDriverTypeSize = driverTypeSize;
     if (m_maxDriverTypeSize % 128 != 0)
       m_maxDriverTypeSize =
           m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128;
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipHostMalloc(&d_driverWorkArray,
                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                       hipHostMallocNonCoherent));
@@ -424,7 +462,9 @@ char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
     m_cycleId = (m_cycleId + 1) % m_maxDriverCycles;
     if (m_cycleId == 0) {
       // ensure any outstanding kernels are completed before we wrap around
-      fence();
+      fence(
+          "Kokkos::HIPInternal::get_next_driver: fence before reusing first "
+          "driver");
     }
   }
   return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId];
@@ -462,7 +502,14 @@ Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags(
 
 namespace Kokkos {
 namespace Impl {
-void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
+void hip_device_synchronize(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
+}
 
 void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
                               const int line) {
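Both fence(name) and hip_device_synchronize now run the actual wait inside
profile_fence_event, so profiling tools observe a named event around the
synchronization. A generic sketch of that bracketing pattern; the hooks
below are stand-ins, not the real Kokkos::Tools interface.

    #include <cstdio>
    #include <functional>
    #include <string>

    // Bracket an arbitrary synchronization callback with named
    // begin/end events, in the spirit of profile_fence_event above.
    void profile_fence_event(const std::string& name, unsigned instance_id,
                             const std::function<void()>& do_fence) {
      std::printf("begin fence '%s' (instance %u)\n", name.c_str(),
                  instance_id);
      do_fence();  // e.g. hipStreamSynchronize(m_stream) in the patch
      std::printf("end fence '%s'\n", name.c_str());
    }

    int main() {
      profile_fence_event("Kokkos::HIPInternal::fence: example", 0,
                          [] { /* device synchronization happens here */ });
    }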
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
index f4f88628e313a2d22d23a09e4ce25630d242a566..967c6fdd4be63e11b00c6b7f97b8d3d0b27bbcfc 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
@@ -48,6 +48,7 @@
 #define KOKKOS_HIP_INSTANCE_HPP
 
 #include <Kokkos_HIP_Space.hpp>
+#include <HIP/Kokkos_HIP_Error.hpp>
 
 #include <mutex>
 
@@ -59,10 +60,12 @@ struct HIPTraits {
   static int constexpr WarpSize       = 64;
   static int constexpr WarpIndexMask  = 0x003f; /* hexadecimal for 63 */
   static int constexpr WarpIndexShift = 6;      /* WarpSize == 1 << WarpShift*/
+  static int constexpr ConservativeThreadsPerBlock =
+      256;  // conservative fallback blocksize in case of spills
   static int constexpr MaxThreadsPerBlock =
-      1024;  // FIXME_HIP -- assumed constant for now
-
+      1024;  // the maximum we can fit in a block
   static int constexpr ConstantMemoryUsage        = 0x008000; /* 32k bytes */
+  static int constexpr KernelArgumentLimit        = 0x001000; /*  4k bytes */
   static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */
 };
 
@@ -90,7 +93,7 @@ class HIPInternal {
   unsigned m_multiProcCount = 0;
   unsigned m_maxWarpCount   = 0;
   unsigned m_maxBlock       = 0;
-  unsigned m_maxBlocksPerSM = 0;
+  unsigned m_maxWavesPerCU  = 0;
   unsigned m_maxSharedWords = 0;
   int m_regsPerSM;
   int m_shmemPerSM       = 0;
@@ -108,6 +111,8 @@ class HIPInternal {
   mutable int m_cycleId = 0;
   // mutex to access d_driverWorkArray
   mutable std::mutex m_mutexWorkArray;
+  // mutex to access shared memory
+  mutable std::mutex m_mutexSharedMemory;
 
   // Scratch Spaces for Reductions
   size_type m_scratchSpaceCount = 0;
@@ -119,7 +124,10 @@ class HIPInternal {
 
   hipDeviceProp_t m_deviceProp;
 
-  hipStream_t m_stream = nullptr;
+  hipStream_t m_stream   = nullptr;
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::HIP>(reinterpret_cast<uintptr_t>(this));
+  bool m_manage_stream = false;
 
   // Team Scratch Level 1 Space
   mutable int64_t m_team_scratch_current_size = 0;
@@ -128,18 +136,25 @@ class HIPInternal {
 
   bool was_finalized = false;
 
+  // FIXME_HIP: these want to be per-device, not per-stream; the use of
+  // 'static' here will break once there are multiple devices
+  static unsigned long *constantMemHostStaging;
+  static hipEvent_t constantMemReusable;
+
   static HIPInternal &singleton();
 
   int verify_is_initialized(const char *const label) const;
 
   int is_initialized() const { return m_hipDev >= 0; }
 
-  void initialize(int hip_device_id, hipStream_t stream = nullptr);
+  void initialize(int hip_device_id, hipStream_t stream = nullptr,
+                  bool manage_stream = false);
   void finalize();
 
   void print_configuration(std::ostream &) const;
 
   void fence() const;
+  void fence(const std::string &) const;
 
   // returns the next driver type pointer in our work array
   char *get_next_driver(size_t driverTypeSize) const;
@@ -151,13 +166,52 @@ class HIPInternal {
   // Resizing of reduction related scratch spaces
   size_type *scratch_space(const size_type size);
   size_type *scratch_flags(const size_type size);
-
+  uint32_t impl_get_instance_id() const noexcept;
   // Resizing of team level 1 scratch
   void *resize_team_scratch_space(std::int64_t bytes,
                                   bool force_shrink = false);
 };
 
 }  // namespace Impl
+
+// Partitioning an Execution Space: expects a space and integer arguments
+// giving relative weights
+//   Customization point for backends
+//   Default behavior is to return the passed-in instance
+
+namespace Impl {
+inline void create_HIP_instances(std::vector<HIP> &instances) {
+  for (int s = 0; s < int(instances.size()); s++) {
+    hipStream_t stream;
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
+    instances[s] = HIP(stream, true);
+  }
+}
+}  // namespace Impl
+
+template <class... Args>
+std::vector<HIP> partition_space(const HIP &, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+
+  std::vector<HIP> instances(sizeof...(Args));
+  Impl::create_HIP_instances(instances);
+  return instances;
+}
+
+template <class T>
+std::vector<HIP> partition_space(const HIP &, std::vector<T> &weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<HIP> instances(weights.size());
+  Impl::create_HIP_instances(instances);
+  return instances;
+}
 }  // namespace Experimental
 }  // namespace Kokkos
 
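A hypothetical use of the new partition_space overloads follows. Note that
this backend only uses the number of weight arguments to decide how many
stream-backed instances to create; the weights themselves are not otherwise
honored. Assumes a Kokkos build with the HIP backend enabled.

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        Kokkos::Experimental::HIP def{};
        // Three independent stream-backed instances, one per argument.
        auto instances = Kokkos::Experimental::partition_space(def, 1, 1, 2);
        for (auto& hip : instances)
          Kokkos::parallel_for(
              Kokkos::RangePolicy<Kokkos::Experimental::HIP>(hip, 0, 1024),
              KOKKOS_LAMBDA(int) { /* independent work per stream */ });
        for (auto& hip : instances) hip.fence();
      }
      Kokkos::finalize();
    }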
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
index f774423b378b0753a98c9e4df512b599910028dd..f209edf7c04ecc9b0001c4527e1bcebc0f24b256 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
@@ -52,6 +52,7 @@
 #include <HIP/Kokkos_HIP_Error.hpp>
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <Kokkos_HIP_Space.hpp>
+#include <HIP/Kokkos_HIP_Locks.hpp>
 
 // Must use global variable on the device with HIP-Clang
 #ifdef __HIP__
@@ -64,7 +65,7 @@ namespace Kokkos {
 namespace Experimental {
 template <typename T>
 inline __device__ T *kokkos_impl_hip_shared_memory() {
-  HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh);
+  extern __shared__ Kokkos::Experimental::HIPSpace::size_type sh[];
   return (T *)sh;
 }
 }  // namespace Experimental
@@ -74,10 +75,12 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+// The hip_parallel_launch_*_memory code is identical to the CUDA code
 template <typename DriverType>
 __global__ static void hip_parallel_launch_constant_memory() {
   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
       kokkos_impl_hip_constant_memory_buffer));
+
   driver();
 }
 
@@ -87,12 +90,13 @@ __global__ __launch_bounds__(
   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
       kokkos_impl_hip_constant_memory_buffer));
 
-  driver->operator()();
+  driver();
 }
 
 template <class DriverType>
 __global__ static void hip_parallel_launch_local_memory(
     const DriverType *driver) {
+  // FIXME_HIP the driver should be passed by copy, as in the CUDA backend
   driver->operator()();
 }
 
@@ -101,6 +105,21 @@ __global__ __launch_bounds__(
     maxTperB,
     minBperSM) static void hip_parallel_launch_local_memory(const DriverType
                                                                 *driver) {
+  // FIXME_HIP the driver should be passed by copy, as in the CUDA backend
+  driver->operator()();
+}
+
+template <typename DriverType>
+__global__ static void hip_parallel_launch_global_memory(
+    const DriverType *driver) {
+  driver->operator()();
+}
+
+template <typename DriverType, unsigned int maxTperB, unsigned int minBperSM>
+__global__ __launch_bounds__(
+    maxTperB,
+    minBperSM) static void hip_parallel_launch_global_memory(const DriverType
+                                                                 *driver) {
   driver->operator()();
 }
 
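The FIXME_HIP comments above record that, unlike the CUDA backend, these
kernels receive the driver through a pointer rather than by value. Toy
kernels contrasting the two launch shapes; illustrative only, not the
patch's code.

    struct Driver {
      int n;
      __device__ void operator()() const { /* ... */ }
    };

    // CUDA-style: the functor travels by value in the kernel arguments.
    __global__ void launch_by_value(Driver driver) { driver(); }

    // Current HIP style: one extra indirection through device-visible
    // memory, as in hip_parallel_launch_local_memory above.
    __global__ void launch_by_pointer(const Driver* driver) {
      driver->operator()();
    }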
@@ -127,33 +146,238 @@ struct HIPDispatchProperties {
   HIPLaunchMechanism launch_mechanism = l;
 };
 
+// Use local memory up to ConstantMemoryUseThreshold,
+// global memory above ConstantMemoryUsage,
+// and ConstantMemory in between.
+// The following code is identical to the CUDA code.
+template <typename DriverType>
+struct DeduceHIPLaunchMechanism {
+  static constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t
+      light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
+  static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t
+      heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight;
+  static constexpr typename DriverType::Policy::work_item_property property =
+      typename DriverType::Policy::work_item_property();
+
+  static constexpr HIPLaunchMechanism valid_launch_mechanism =
+      // BuildValidMask
+      (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
+           ? HIPLaunchMechanism::LocalMemory
+           : HIPLaunchMechanism::Default) |
+      (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
+           ? HIPLaunchMechanism::ConstantMemory
+           : HIPLaunchMechanism::Default) |
+      HIPLaunchMechanism::GlobalMemory;
+
+  static constexpr HIPLaunchMechanism requested_launch_mechanism =
+      (((property & light_weight) == light_weight)
+           ? HIPLaunchMechanism::LocalMemory
+           : HIPLaunchMechanism::ConstantMemory) |
+      HIPLaunchMechanism::GlobalMemory;
+
+  static constexpr HIPLaunchMechanism default_launch_mechanism =
+      // choose the default from the functor size alone
+      (sizeof(DriverType) < HIPTraits::ConstantMemoryUseThreshold)
+          ? HIPLaunchMechanism::LocalMemory
+          : ((sizeof(DriverType) < HIPTraits::ConstantMemoryUsage)
+                 ? HIPLaunchMechanism::ConstantMemory
+                 : HIPLaunchMechanism::GlobalMemory);
+
+  // Decision table: F is sizeof(DriverType); UseT, KAL, and CMU are the
+  // HIPTraits thresholds ConstantMemoryUseThreshold, KernelArgumentLimit,
+  // and ConstantMemoryUsage. Per work-item property, the four columns
+  // list the valid, requested, default, and finally chosen launch
+  // mechanisms (L = Local, C = Constant, G = Global memory).
+  //              None                LightWeight    HeavyWeight
+  // F<UseT       LCG  LCG L  L       LCG  LG L  L   LCG  CG L  C
+  // UseT<F<KAL   LCG  LCG C  C       LCG  LG C  L   LCG  CG C  C
+  // KAL<F<CMU     CG  LCG C  C        CG  LG C  G    CG  CG C  C
+  // CMU<F          G  LCG G  G         G  LG G  G     G  CG G  G
+  static constexpr HIPLaunchMechanism launch_mechanism =
+      ((property & light_weight) == light_weight)
+          ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
+                 ? HIPLaunchMechanism::LocalMemory
+                 : HIPLaunchMechanism::GlobalMemory)
+          : (((property & heavy_weight) == heavy_weight)
+                 ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
+                        ? HIPLaunchMechanism::ConstantMemory
+                        : HIPLaunchMechanism::GlobalMemory)
+                 : (default_launch_mechanism));
+};
+
+template <typename DriverType, typename LaunchBounds,
+          HIPLaunchMechanism LaunchMechanism>
+struct HIPParallelLaunchKernelFuncData {
+  static unsigned int get_scratch_size(
+      hipFuncAttributes const &hip_func_attributes) {
+    return hip_func_attributes.localSizeBytes;
+  }
+
+  static hipFuncAttributes get_hip_func_attributes(void const *kernel_func) {
+    static hipFuncAttributes attr = [=]() {
+      hipFuncAttributes attr;
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipFuncGetAttributes(&attr, kernel_func));
+      return attr;
+    }();
+    return attr;
+  }
+};
+
+//---------------------------------------------------------------//
+// HIPParallelLaunchKernelFunc structure and its specializations //
+//---------------------------------------------------------------//
 template <typename DriverType, typename LaunchBounds,
           HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunchKernelFunc;
 
+// HIPLaunchMechanism::LocalMemory specializations
 template <typename DriverType, unsigned int MaxThreadsPerBlock,
           unsigned int MinBlocksPerSM>
 struct HIPParallelLaunchKernelFunc<
     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
     HIPLaunchMechanism::LocalMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::LocalMemory>;
   static auto get_kernel_func() {
     return hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
                                             MinBlocksPerSM>;
   }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
 };
 
 template <typename DriverType>
 struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
                                    HIPLaunchMechanism::LocalMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::LocalMemory>;
+  static auto get_kernel_func() {
+    return HIPParallelLaunchKernelFunc<
+        DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+        HIPLaunchMechanism::LocalMemory>::get_kernel_func();
+  }
+
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+// HIPLaunchMechanism::GlobalMemory specializations
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct HIPParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    HIPLaunchMechanism::GlobalMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::GlobalMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
+                                             MinBlocksPerSM>;
+  }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+template <typename DriverType>
+struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                   HIPLaunchMechanism::GlobalMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::GlobalMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_global_memory<DriverType>;
+  }
+
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+// HIPLaunchMechanism::ConstantMemory specializations
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct HIPParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    HIPLaunchMechanism::ConstantMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::ConstantMemory>;
   static auto get_kernel_func() {
-    return hip_parallel_launch_local_memory<DriverType, 1024, 1>;
+    return hip_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
+                                               MinBlocksPerSM>;
+  }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
   }
 };
 
+template <typename DriverType>
+struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                   HIPLaunchMechanism::ConstantMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::ConstantMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_constant_memory<DriverType>;
+  }
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+//------------------------------------------------------------------//
+// HIPParallelLaunchKernelInvoker structure and its specializations //
+//------------------------------------------------------------------//
 template <typename DriverType, typename LaunchBounds,
           HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunchKernelInvoker;
 
+// HIPLaunchMechanism::LocalMemory specialization
 template <typename DriverType, typename LaunchBounds>
 struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
                                       HIPLaunchMechanism::LocalMemory>
@@ -170,21 +394,83 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
   }
 };
 
+// HIPLaunchMechanism::GlobalMemory specialization
+template <typename DriverType, typename LaunchBounds>
+struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                      HIPLaunchMechanism::GlobalMemory>
+    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::GlobalMemory> {
+  using base_t = HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                             HIPLaunchMechanism::GlobalMemory>;
+
+  // FIXME_HIP the code differs from CUDA because the driver cannot be
+  // passed by copy
+  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
+                            dim3 const &block, int shmem,
+                            HIPInternal const *hip_instance) {
+    (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
+        driver);
+  }
+};
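+
+// Note, inferred from the signature above: for the GlobalMemory mechanism the
+// caller must already have copied the driver into device-accessible memory;
+// only the device pointer goes through the triple-chevron launch, e.g.
+//
+//   DriverType *d_driver = /* device buffer holding a copy of the driver */;
+//   base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance);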
+
+// HIPLaunchMechanism::ConstantMemory specialization
+template <typename DriverType, typename LaunchBounds>
+struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                      HIPLaunchMechanism::ConstantMemory>
+    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::ConstantMemory> {
+  using base_t =
+      HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::ConstantMemory>;
+  static_assert(sizeof(DriverType) < HIPTraits::ConstantMemoryUsage,
+                "Kokkos Error: Requested HIPLaunchConstantMemory with a "
+                "Functor larger than 32kB.");
+
+  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
+                            dim3 const &block, int shmem,
+                            HIPInternal const *hip_instance) {
+    // Wait until the previous kernel that uses the constant buffer is done
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipEventSynchronize(hip_instance->constantMemReusable));
+
+    // Copy functor (synchronously) to staging buffer in pinned host memory
+    unsigned long *staging = hip_instance->constantMemHostStaging;
+    std::memcpy(static_cast<void *>(staging),
+                static_cast<void const *>(driver), sizeof(DriverType));
+
+    // Copy functor asynchronously from there to constant memory on the device
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(
+        HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer), staging,
+        sizeof(DriverType), 0, hipMemcpyHostToDevice, hip_instance->m_stream));
+
+    // Invoke the driver function on the device
+    (base_t::
+         get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>();
+
+    // Record an event that says when the constant buffer can be reused
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventRecord(hip_instance->constantMemReusable,
+                                             hip_instance->m_stream));
+  }
+};
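+
+// The event protocol above, as a rough timeline (illustrative):
+//   hipEventSynchronize(constantMemReusable)   // wait: is the buffer free?
+//   std::memcpy(staging, driver, ...)          // synchronous, host side
+//   hipMemcpyToSymbolAsync(...)                // staging -> constant memory
+//   kernel<<<...>>>()                          // reads the constant buffer
+//   hipEventRecord(constantMemReusable)        // free again after the kernel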
+
+//-----------------------------//
+// HIPParallelLaunch structure //
+//-----------------------------//
 template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
-          HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory>
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
 struct HIPParallelLaunch;
 
 template <typename DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
+          unsigned int MinBlocksPerSM, HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunch<
     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    HIPLaunchMechanism::LocalMemory>
+    LaunchMechanism>
     : HIPParallelLaunchKernelInvoker<
           DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-          HIPLaunchMechanism::LocalMemory> {
+          LaunchMechanism> {
   using base_t = HIPParallelLaunchKernelInvoker<
       DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-      HIPLaunchMechanism::LocalMemory>;
+      LaunchMechanism>;
 
   HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
                     const dim3 &block, const int shmem,
@@ -205,22 +491,48 @@ struct HIPParallelLaunch<
       base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      HIP_SAFE_CALL(hipGetLastError());
-      hip_instance->fence();
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipGetLastError());
+      hip_instance->fence(
+          "Kokkos::Experimental::Impl::HIParallelLaunch: Debug Only Check for "
+          "Execution Error");
 #endif
     }
   }
+};
 
-  static hipFuncAttributes get_hip_func_attributes() {
-    static hipFuncAttributes attr = []() {
-      hipFuncAttributes attr;
-      HIP_SAFE_CALL(hipFuncGetAttributes(
-          &attr, reinterpret_cast<void const *>(base_t::get_kernel_func())));
-      return attr;
-    }();
-    return attr;
+// Convenience method to launch the correct kernel given the launch bounds
+// and launch mechanism
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+void hip_parallel_launch(const DriverType &driver, const dim3 &grid,
+                         const dim3 &block, const int shmem,
+                         const HIPInternal *hip_instance,
+                         const bool prefer_shmem) {
+  // FIXME_HIP - could be if constexpr with C++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // for user-defined launch bounds, we *always* honor the request
+    HIPParallelLaunch<DriverType, LaunchBounds, LaunchMechanism>(
+        driver, grid, block, shmem, hip_instance, prefer_shmem);
+  } else {
+    // we can do what we like
+    const unsigned flat_block_size = block.x * block.y * block.z;
+    if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) {
+      // the requested block size fits within the conservative launch bounds
+      HIPParallelLaunch<
+          DriverType,
+          Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
+          LaunchMechanism>(driver, grid, block, shmem, hip_instance,
+                           prefer_shmem);
+    } else {
+      // otherwise we have to use the large blocksize
+      HIPParallelLaunch<DriverType,
+                        Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+                        LaunchMechanism>(driver, grid, block, shmem,
+                                         hip_instance, prefer_shmem);
+    }
   }
-};
+}
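+
+// Usage sketch, mirroring the call sites in the Parallel* backends below:
+//
+//   using ClosureType = ParallelFor<FunctorType, Policy,
+//                                   Kokkos::Experimental::HIP>;
+//   Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+//                                                   LaunchBounds>(
+//       *this, grid, block, shmem,
+//       m_policy.space().impl_internal_space_instance(), false);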
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
index 4f5271b6f644605e24ab277a7b08b25ba8c2ea84..c4292d35eca793bc58d76ba20db4358f85810996 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
@@ -84,11 +84,17 @@ namespace Impl {
 HIPLockArrays g_host_hip_lock_arrays = {nullptr, nullptr, 0};
 
 void initialize_host_hip_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::init_lock_arrays();
+
+  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
+#endif
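+
+  // Note that the desul initialization above runs before the early return
+  // below, so it happens even when Kokkos' own arrays were already allocated.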
+
   if (g_host_hip_lock_arrays.atomic != nullptr) return;
-  HIP_SAFE_CALL(hipMalloc(
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
       &g_host_hip_lock_arrays.atomic,
       sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1)));
-  HIP_SAFE_CALL(hipMalloc(
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
       &g_host_hip_lock_arrays.scratch,
       sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency())));
 
@@ -103,10 +109,14 @@ void initialize_host_hip_lock_arrays() {
 }
 
 void finalize_host_hip_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::finalize_lock_arrays();
+#endif
+
   if (g_host_hip_lock_arrays.atomic == nullptr) return;
-  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
   g_host_hip_lock_arrays.atomic = nullptr;
-  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
   g_host_hip_lock_arrays.scratch = nullptr;
   g_host_hip_lock_arrays.n       = 0;
 #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
index f34f85f43b0bb2ac2b07d4149957f5027991395f..71b104c2e4b65aff7ab3b3688c0901d000e8d9d8 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
@@ -51,6 +51,10 @@
 
 #include <HIP/Kokkos_HIP_Error.hpp>
 
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics/Lock_Array_HIP.hpp>
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -147,7 +151,7 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 #define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                 \
   {                                                             \
     if (::Kokkos::Impl::lock_array_copied == 0) {               \
-      HIP_SAFE_CALL(hipMemcpyToSymbol(                          \
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol(              \
           HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \
           &::Kokkos::Impl::g_host_hip_lock_arrays,              \
           sizeof(::Kokkos::Impl::HIPLockArrays)));              \
@@ -155,6 +159,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
     ::Kokkos::Impl::lock_array_copied = 1;                      \
   }
 
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+
 #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
 #else
@@ -162,6 +168,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
   KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
+#else
+
+#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#else
+// Still need COPY_HIP_LOCK_ARRAYS for team scratch etc.
+#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#endif
+
+#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
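+
+// Resulting expansion with desul atomics enabled and relocatable device code
+// off (illustrative):
+//   KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+//     -> KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()   // Kokkos lock arrays
+//        DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()  // desul lock arrays
+// With relocatable device code enabled the macro expands to nothing, as
+// defined above.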
+
 #endif /* defined( __HIPCC__ ) */
 
 #endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
index ce1aff9586d25911104d17d53860409f3e73b10b..acb538e1cb3970bab9bafc2f44d3568b34c6c31f 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
@@ -28,7 +28,8 @@ inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::HIP>(
       space.impl_internal_space_instance()->m_maxThreadsPerSM;
   properties.default_largest_tile_size = 16;
   properties.default_tile_size         = 4;
-  properties.max_total_tile_size       = 1024;
+  properties.max_total_tile_size =
+      Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
   return properties;
 }
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
index 35e7d6fb853ae9e4f245e0fe0c2a71f4f2d4d6c2..eae323dd913d7f20383e3afcf5f1264d013b7be1 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -81,6 +81,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   inline void execute() const {
+    using ClosureType =
+        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
     if (m_policy.m_num_tiles == 0) return;
     array_index_type const maxblocks = static_cast<array_index_type>(
         m_policy.space().impl_internal_space_instance()->m_maxBlock);
@@ -94,7 +96,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                        block.y,
                    maxblocks),
           1);
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 3) {
@@ -110,7 +113,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 4) {
@@ -128,7 +132,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 5) {
@@ -147,7 +152,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 6) {
@@ -165,7 +171,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                       std::min(static_cast<index_type>(m_policy.m_tile_end[4] *
                                                        m_policy.m_tile_end[5]),
                                static_cast<index_type>(maxblocks)));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else {
@@ -178,22 +185,18 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       : m_functor(arg_functor), m_policy(arg_policy) {}
 
   template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
+  static int max_tile_size_product(const Policy&, const Functor&) {
     using closure_type =
         ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                     Kokkos::Experimental::HIP>;
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    auto const& prop = pol.space().hip_device_prop();
-    // Limits due to registers/SM, MDRange doesn't have
-    // shared memory constraints
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(
-            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
+                                                          LaunchBounds>();
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
+                      "tile size."));
+    }
+    return block_size;
   }
 };
 
@@ -242,6 +245,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   const bool m_result_ptr_device_accessible;
   size_type* m_scratch_space;
   size_type* m_scratch_flags;
+  // Only let one ParallelReduce/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
   using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
       Policy::rank, Policy, FunctorType, WorkTag, reference_type>;
@@ -307,32 +313,30 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   // Determine block size constrained by shared memory:
   // This is copy/paste from Kokkos_HIP_Parallel_Range
   inline unsigned local_block_size(const FunctorType& f) {
-    unsigned int n =
-        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
-    int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
-        false, FunctorType, WorkTag>(f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned>(
-             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                  LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
-          false, FunctorType, WorkTag>(f, n);
+    const auto& instance = m_policy.space().impl_internal_space_instance();
+    auto shmem_functor   = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      WorkTag>(f, n);
+    };
+    using closure_type = ParallelReduce<FunctorType, Policy, ReducerType,
+                                        Kokkos::Experimental::HIP>;
+
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<closure_type,
+                                                                LaunchBounds>(
+            instance, shmem_functor);
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid tile size."));
     }
-    return n;
+    return block_size;
   }
 
   inline void execute() {
-    const int nwork = m_policy.m_num_tiles;
+    using ClosureType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                       Kokkos::Experimental::HIP>;
+    const int nwork   = m_policy.m_num_tiles;
     if (nwork) {
       int block_size = m_policy.m_prod_tile_dims;
       // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
@@ -366,14 +370,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
               false, FunctorType, WorkTag>(m_functor, block.y);
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<MDRangePolicy,HIP>: fence because "
+            "reduction can't access result storage location");
 
         if (m_result_ptr) {
           const int size = ValueTraits::value_size(
@@ -403,7 +409,10 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
             MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
                               typename ViewType::memory_space>::accessible),
         m_scratch_space(nullptr),
-        m_scratch_flags(nullptr) {}
+        m_scratch_flags(nullptr),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 
   ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                  const ReducerType& reducer)
@@ -416,23 +425,25 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                               typename ReducerType::result_view_type::
                                   memory_space>::accessible),
         m_scratch_space(nullptr),
-        m_scratch_flags(nullptr) {}
+        m_scratch_flags(nullptr),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
+
   template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
+  static int max_tile_size_product(const Policy&, const Functor&) {
     using closure_type =
         ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                        ReducerType, Kokkos::Experimental::HIP>;
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    auto const& prop = pol.space().hip_device_prop();
-    // Limits due do registers/SM
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(
-            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
+                                                          LaunchBounds>();
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid tile size."));
+    }
+    return block_size;
   }
 };
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
index 7d2825eeb4c6be1d060d1e8d7c3eb67097729ccf..e02ead1e990151a30d0a87b280bada8c774ca5c5 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
@@ -108,16 +108,21 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   inline void execute() const {
     const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
 
+    using DriverType =
+        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
     const int block_size =
-        LaunchBounds::maxTperB
-            ? LaunchBounds::maxTperB
-            : ::Kokkos::Experimental::Impl::HIPTraits::
-                  MaxThreadsPerBlock;  // FIXME_HIP Choose block_size better
+        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<DriverType,
+                                                                LaunchBounds>();
     const dim3 block(1, block_size, 1);
     const dim3 grid(
         typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1);
 
-    Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a "
+                      "valid execution configuration."));
+    }
+    Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
         *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
         false);
   }
@@ -173,15 +178,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   const bool m_result_ptr_host_accessible;
   size_type* m_scratch_space = nullptr;
   size_type* m_scratch_flags = nullptr;
+  // Only let one ParallelReduce/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
-#if HIP_VERSION < 401
-  static bool constexpr UseShflReduction =
-      ((sizeof(value_type) > 2 * sizeof(double)) &&
-       static_cast<bool>(ValueTraits::StaticValueSize));
-#else
   static bool constexpr UseShflReduction =
       static_cast<bool>(ValueTraits::StaticValueSize);
-#endif
 
  private:
   struct ShflReductionTag {};
@@ -328,30 +330,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   // Determine block size constrained by shared memory:
   inline unsigned local_block_size(const FunctorType& f) {
-    unsigned int n =
-        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
-    int shmem_size =
-        hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-            f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned int>(
-             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                  LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size =
-          hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-              f, n);
-    }
-    return n;
+    const auto& instance = m_policy.space().impl_internal_space_instance();
+    auto shmem_functor   = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      WorkTag>(f, n);
+    };
+    using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                      Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, LaunchBounds>(instance, shmem_functor);
   }
 
   inline void execute() {
@@ -362,7 +349,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                  !std::is_same<ReducerType, InvalidType>::value;
     if ((nwork > 0) || need_device_set) {
       const int block_size = local_block_size(m_functor);
-      KOKKOS_ASSERT(block_size > 0);
+      if (block_size == 0) {
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                        "valid execution configuration."));
+      }
 
       m_scratch_space =
           ::Kokkos::Experimental::Impl::hip_internal_scratch_space(
@@ -391,14 +382,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                                          WorkTag>(m_functor,
                                                                   block.y);
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    LaunchBounds>(
+      using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                        Kokkos::Experimental::HIP>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().impl_internal_space_instance()->fence();
+        m_policy.space().impl_internal_space_instance()->fence(
+            "Kokkos::Impl::ParallelReduce<RangePolicy,HIP>: fence because "
+            "reduction can't access result storage location");
 
         if (m_result_ptr) {
           const int size = ValueTraits::value_size(
@@ -429,7 +423,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                               typename ViewType::memory_space>::accessible),
         m_result_ptr_host_accessible(
             MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ViewType::memory_space>::accessible) {}
+                              typename ViewType::memory_space>::accessible),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 
   ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                  const ReducerType& reducer)
@@ -444,7 +441,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         m_result_ptr_host_accessible(
             MemorySpaceAccess<Kokkos::HostSpace,
                               typename ReducerType::result_view_type::
-                                  memory_space>::accessible) {}
+                                  memory_space>::accessible),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 };
 
 template <class FunctorType, class... Traits>
@@ -482,6 +482,9 @@ class ParallelScanHIPBase {
   size_type* m_scratch_flags = nullptr;
   size_type m_final          = false;
   int m_grid_x               = 0;
+  // Only let one ParallelReduce/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
  private:
   template <class TagType>
@@ -624,22 +627,7 @@ class ParallelScanHIPBase {
   }
 
   // Determine block size constrained by shared memory:
-  inline unsigned local_block_size(const FunctorType& f) {
-    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
-    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y
-    //
-    // TODO check best option
-
-    unsigned n = Experimental::Impl::HIPTraits::WarpSize * 4;
-    while (n && static_cast<unsigned>(m_policy.space()
-                                          .impl_internal_space_instance()
-                                          ->m_maxShmemPerBlock) <
-                    hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                             WorkTag>(f, n)) {
-      n >>= 1;
-    }
-    return n;
-  }
+  virtual inline unsigned local_block_size(const FunctorType& f) = 0;
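+  // (Implemented by ParallelScan and ParallelScanWithTotal at the bottom of
+  // this file; each names its own concrete closure type when querying the
+  // preferred block size.)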
 
   inline void impl_execute() {
     const index_type nwork = m_policy.end() - m_policy.begin();
@@ -649,7 +637,11 @@ class ParallelScanHIPBase {
       const int gridMaxComputeCapability_2x = 0x01fff;
 
       const int block_size = static_cast<int>(local_block_size(m_functor));
-      KOKKOS_ASSERT(block_size > 0);
+      if (block_size == 0) {
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Kokkos::Impl::ParallelScan< HIP > could not find a "
+                        "valid execution configuration."));
+      }
 
       const int grid_max =
           std::min(block_size * block_size, gridMaxComputeCapability_2x);
@@ -674,15 +666,16 @@ class ParallelScanHIPBase {
       const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
 
       m_final = false;
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
-                                                    LaunchBounds>(
+      // using the base class as the driver here is fine because the
+      // specializations do not modify the kernel at all
+      using DriverType = ParallelScanHIPBase<FunctorType, Traits...>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       m_final = true;
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
-                                                    LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
@@ -690,13 +683,17 @@ class ParallelScanHIPBase {
   }
 
   ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 };
 
 template <class FunctorType, class... Traits>
 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                    Kokkos::Experimental::HIP>
-    : private ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, Traits...> {
  public:
   using Base = ParallelScanHIPBase<FunctorType, Traits...>;
   using Base::operator();
@@ -706,6 +703,23 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScan(const FunctorType& arg_functor,
                const typename Base::Policy& arg_policy)
       : Base(arg_functor, arg_policy) {}
+
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be a power of two: 128 (2 warps), 256 (4 warps), or
+    // 512 (8 warps); gridDim.x <= blockDim.y * blockDim.y
+
+    const auto& instance =
+        Base::m_policy.space().impl_internal_space_instance();
+    auto shmem_functor = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      typename Base::WorkTag>(
+          f, n);
+    };
+    using DriverType = ParallelScan<FunctorType, typename Base::Policy,
+                                    Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -713,7 +727,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 template <class FunctorType, class ReturnType, class... Traits>
 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                             ReturnType, Kokkos::Experimental::HIP>
-    : private ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, Traits...> {
  public:
   using Base = ParallelScanHIPBase<FunctorType, Traits...>;
   using Base::operator();
@@ -737,6 +751,24 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                         const typename Base::Policy& arg_policy,
                         ReturnType& arg_returnvalue)
       : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {}
+
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be a power of two: 128 (2 warps), 256 (4 warps), or
+    // 512 (8 warps); gridDim.x <= blockDim.y * blockDim.y
+
+    const auto& instance =
+        Base::m_policy.space().impl_internal_space_instance();
+    auto shmem_functor = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      typename Base::WorkTag>(
+          f, n);
+    };
+    using DriverType =
+        ParallelScanWithTotal<FunctorType, typename Base::Policy, ReturnType,
+                              Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
+  }
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
index 96c3ff2a751027a4eb05b03c99487207c9acf708..b794f5bc037111a8774ed23d1181326d3fa23b51 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -56,20 +56,20 @@
 
 namespace Kokkos {
 namespace Impl {
+
 template <typename... Properties>
 class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     : public PolicyTraits<Properties...> {
  public:
   using execution_policy = TeamPolicyInternal;
 
-  using traits = PolicyTraits<Properties...>;
+  using traits    = PolicyTraits<Properties...>;
+  using BlockType = Kokkos::Experimental::Impl::BlockType;
 
   template <typename ExecSpace, typename... OtherProperties>
   friend class TeamPolicyInternal;
 
  private:
-  static int constexpr MAX_WARP = 8;
-
   typename traits::execution_space m_space;
   int m_league_size;
   int m_team_size;
@@ -101,17 +101,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   template <typename FunctorType>
   int team_size_max(FunctorType const& f, ParallelForTag const&) const {
     using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size<
-        FunctorType, typename traits::launch_bounds>(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
-    return block_size / impl_vector_length();
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+
+    return internal_team_size_common<BlockType::Max, closure_type>(f);
   }
 
   template <class FunctorType>
@@ -129,8 +121,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     return internal_team_size_max<closure_type>(f);
   }
 
-  template <class FunctorType, class ReducerType>
-  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
+  template <typename FunctorType, typename ReducerType>
+  inline int team_size_max(const FunctorType& f, const ReducerType&,
                            const ParallelReduceTag&) const {
     using closure_type =
         Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
@@ -141,17 +133,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   template <typename FunctorType>
   int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
     using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-        FunctorType, typename traits::launch_bounds>(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
-    return block_size / impl_vector_length();
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+
+    return internal_team_size_common<BlockType::Preferred, closure_type>(f);
   }
 
   template <typename FunctorType>
@@ -169,7 +153,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     return internal_team_size_recommended<closure_type>(f);
   }
 
-  template <class FunctorType, class ReducerType>
+  template <typename FunctorType, typename ReducerType>
   int team_size_recommended(FunctorType const& f, ReducerType const&,
                             ParallelReduceTag const&) const {
     using closure_type =
@@ -177,6 +161,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
                              ReducerType>;
     return internal_team_size_recommended<closure_type>(f);
   }
+
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
   static int vector_length_max() {
@@ -211,7 +196,10 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
   inline void impl_set_team_size(size_t size) { m_team_size = size; }
   int impl_vector_length() const { return m_vector_length; }
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
+#endif
 
   int team_size() const { return m_team_size; }
 
@@ -266,7 +254,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
           "space.");
 
     // Make sure total block size is permissible
-    if (m_team_size * m_vector_length > 1024) {
+    if (m_team_size * m_vector_length >
+        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock) {
       Impl::throw_runtime_exception(
           std::string("Kokkos::TeamPolicy< HIP > the team size is too large. "
                       "Team size x vector length must be smaller than 1024."));
@@ -363,26 +352,84 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   using member_type = Kokkos::Impl::HIPTeamMember;
 
  protected:
-  template <class ClosureType, class FunctorType, class BlockSizeCallable>
-  int internal_team_size_common(const FunctorType& f,
-                                BlockSizeCallable&& block_size_callable) const {
-    using closure_type = ClosureType;
+  template <BlockType BlockSize, class ClosureType, class FunctorType>
+  int internal_team_size_common(const FunctorType& f) const {
+    // FIXME_HIP: this could be unified with internal_team_size_common_reduce
+    // once we can turn C++17 constexpr on by default. The problem right now
+    // is that we cannot turn off the evaluation of functor_value_traits'
+    // value_size / StaticValueSize.
+
+    const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
+    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double);
+    const int vector_length     = impl_vector_length();
+
+    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
+                             const hipFuncAttributes& attr, int block_size) {
+      int functor_shmem =
+          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+              f, block_size / vector_length);
+      return shmem_block + shmem_thread * (block_size / vector_length) +
+             functor_shmem + attr.sharedSizeBytes;
+    };
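+    // The lambda above reports, for a candidate block size, the total shared
+    // memory a launch would need: the block-level scratch, the per-thread
+    // scratch scaled by the team size (block_size / vector_length), the
+    // functor's own team shmem request, and the kernel's static
+    // sharedSizeBytes.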
+    int block_size;
+    // FIXME_HIP - could be if constexpr with C++17
+    if (BlockSize == BlockType::Max) {
+      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
+          ClosureType, typename traits::launch_bounds>(
+          space().impl_internal_space_instance(), functor);
+    } else {
+      block_size =
+          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
+              ClosureType, typename traits::launch_bounds>(
+              space().impl_internal_space_instance(), functor);
+    }
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
+                      "team size."));
+    }
+    return block_size / impl_vector_length();
+  }
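+  // Example (sketch): team_size_max(f, ParallelForTag()) above resolves to
+  //   internal_team_size_common<BlockType::Max, closure_type>(f)
+  // and returns the permissible threads per block divided by the vector
+  // length, i.e. the largest team size for this closure.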
+
+  template <BlockType BlockSize, class ClosureType, class FunctorType>
+  int internal_team_size_common_reduce(const FunctorType& f) const {
     using functor_value_traits =
         Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;
 
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double) +
-            ((functor_value_traits::StaticValueSize != 0)
-                 ? 0
-                 : functor_value_traits::value_size(f)));
-    KOKKOS_ASSERT(block_size > 0);
+    const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
+    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double) +
+                                  ((functor_value_traits::StaticValueSize != 0)
+                                       ? 0
+                                       : functor_value_traits::value_size(f));
+    const int vector_length = impl_vector_length();
+
+    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
+                             const hipFuncAttributes& attr, int block_size) {
+      int functor_shmem =
+          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+              f, block_size / vector_length);
+      return shmem_block + shmem_thread * (block_size / vector_length) +
+             functor_shmem + attr.sharedSizeBytes;
+    };
+    int block_size;
+    // FIXME_HIP - could be if constexpr with C++17
+    if (BlockSize == BlockType::Max) {
+      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
+          ClosureType, typename traits::launch_bounds>(
+          space().impl_internal_space_instance(), functor);
+    } else {
+      block_size =
+          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
+              ClosureType, typename traits::launch_bounds>(
+              space().impl_internal_space_instance(), functor);
+    }
 
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid team size."));
+    }
     // Currently we require Power-of-2 team size for reductions.
     int p2 = 1;
     while (p2 <= block_size) p2 *= 2;
@@ -392,16 +439,13 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
 
   template <class ClosureType, class FunctorType>
   int internal_team_size_max(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f, ::Kokkos::Experimental::Impl::hip_get_max_block_size<
-               FunctorType, typename traits::launch_bounds>);
+    return internal_team_size_common_reduce<BlockType::Max, ClosureType>(f);
   }
 
   template <class ClosureType, class FunctorType>
   int internal_team_size_recommended(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-               FunctorType, typename traits::launch_bounds>);
+    return internal_team_size_common_reduce<BlockType::Preferred, ClosureType>(
+        f);
   }
 };
 
@@ -505,7 +549,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     dim3 const block(static_cast<int>(m_vector_size),
                      static_cast<int>(m_team_size), 1);
 
-    ::Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, launch_bounds>(
+    using closure_type =
+        ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                    Kokkos::Experimental::HIP>;
+    ::Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
+                                                      launch_bounds>(
         *this, grid, block, shmem_size_total,
         m_policy.space().impl_internal_space_instance(),
         true);  // copy to device and execute
@@ -520,17 +568,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelFor, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-                  FunctorType, launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor, ParallelForTag());
 
     m_shmem_begin = (sizeof(double) * (m_team_size + 2));
     m_shmem_size =
@@ -556,23 +596,12 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     int const shmem_size_total = m_shmem_begin + m_shmem_size;
     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
         shmem_size_total) {
-      printf(
-          "%i %i\n",
-          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
-          shmem_size_total);
       Kokkos::Impl::throw_runtime_exception(std::string(
           "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory"));
     }
 
-    if (static_cast<int>(m_team_size) >
-        static_cast<int>(
-            ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                 launch_bounds>(
-                m_policy.space().impl_internal_space_instance(), attr,
-                arg_functor, arg_policy.impl_vector_length(),
-                arg_policy.team_scratch_size(0),
-                arg_policy.thread_scratch_size(0)) /
-            arg_policy.impl_vector_length())) {
+    size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(std::string(
           "Kokkos::Impl::ParallelFor< HIP > requested too large team size."));
     }
@@ -839,8 +868,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       }
       const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    launch_bounds>(
+      using closure_type =
+          ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                         ReducerType, Kokkos::Experimental::HIP>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
+                                                      launch_bounds>(
           *this, grid, block, shmem_size_total,
           m_policy.space().impl_internal_space_instance(),
           true);  // copy to device and execute
@@ -890,17 +922,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelReduce, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
-                                                                 launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor, ParallelReduceTag());
 
     m_team_begin =
         UseShflReduction
@@ -958,8 +982,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                       "L0 scratch memory"));
     }
 
-    if (static_cast<int>(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+    size_t max_size =
+        arg_policy.team_size_max(arg_functor, ParallelReduceTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                       "large team size."));
@@ -992,18 +1017,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelReduce, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
-                                                                 launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
-
+    m_team_size = m_team_size >= 0
+                      ? m_team_size
+                      : arg_policy.team_size_recommended(arg_functor, reducer,
+                                                         ParallelReduceTag());
     m_team_begin =
         UseShflReduction
             ? 0
@@ -1046,7 +1063,6 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // upon team size.
 
     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-
     if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
          !UseShflReduction) ||
         m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
@@ -1054,8 +1070,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
     }
-    if (static_cast<int>(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+
+    size_t max_size =
+        arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                       "large team size."));
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
index 15ca089d14740b6a2c42c69945a17a0c7bfa1bcc..e25ebe2ab355e626273aeff34615db45aa3465c7 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -67,102 +67,32 @@ namespace {
 hipStream_t get_deep_copy_stream() {
   static hipStream_t s = nullptr;
   if (s == nullptr) {
-    HIP_SAFE_CALL(hipStreamCreate(&s));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&s));
   }
   return s;
 }
 }  // namespace
 
-DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
+void DeepCopyHIP(void* dst, void const* src, size_t n) {
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
 }
 
-DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIPHostPinnedSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
+void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst,
+                      void const* src, size_t n) {
+  KOKKOS_IMPL_HIP_SAFE_CALL(
       hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
 }
 
 void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
   hipStream_t s = get_deep_copy_stream();
-  HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
-  HIP_SAFE_CALL(hipStreamSynchronize(s));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
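+  // profile_fence_event reports a named fence event to any attached Kokkos
+  // Tools callbacks and invokes the given lambda, which performs the actual
+  // synchronization.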
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      "Kokkos::Impl::DeepCopyAsyncHIP: Post Deep Copy Fence on Deep-Copy "
+      "stream",
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          DeepCopyResourceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(s)); });
 }
 
 }  // namespace Impl
@@ -171,6 +101,7 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 
 KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() {
@@ -188,6 +119,7 @@ KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) {
 }
 
 }  // namespace Kokkos
+#endif
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
@@ -283,7 +215,7 @@ void HIPSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-  HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
 }
 
 void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr,
@@ -307,7 +239,7 @@ void HIPHostPinnedSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-  HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
 }
 
 }  // namespace Experimental
@@ -427,23 +359,42 @@ HIP::HIP()
       "HIP instance constructor");
 }
 
-HIP::HIP(hipStream_t const stream)
+HIP::HIP(hipStream_t const stream, bool manage_stream)
     : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) {
         ptr->finalize();
         delete ptr;
       }) {
   Impl::HIPInternal::singleton().verify_is_initialized(
       "HIP instance constructor");
-  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream);
+  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream,
+                               manage_stream);
 }
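+// Usage sketch (illustrative only) for the constructor above: adopt an
+// existing HIP stream; when manage_stream is true the instance takes
+// ownership and destroys the stream during finalization.
+//   hipStream_t s;
+//   KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&s));
+//   Kokkos::Experimental::HIP exec(s, /*manage_stream=*/true);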
 
 void HIP::print_configuration(std::ostream& s, const bool) {
   Impl::HIPInternal::singleton().print_configuration(s);
 }
 
-void HIP::impl_static_fence() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
+uint32_t HIP::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
+void HIP::impl_static_fence(const std::string& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
+}
+void HIP::impl_static_fence() {
+  impl_static_fence("Kokkos::HIP::impl_static_fence: Unnamed Static Fence");
+}
 
-void HIP::fence() const { m_space_instance->fence(); }
+void HIP::fence(const std::string& name) const {
+  m_space_instance->fence(name);
+}
+void HIP::fence() const {
+  fence("Kokkos::HIP::fence(): Unnamed Instance Fence");
+}
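+
+// Usage sketch (illustrative only): instance fences can now carry a label
+// that is forwarded to profiling tools, e.g.
+//   Kokkos::Experimental::HIP exec;
+//   exec.fence("MyApp: wait for kernels on this instance");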
 
 hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; }
 
@@ -489,6 +440,9 @@ void HIPSpaceInitializer::finalize(const bool all_spaces) {
 void HIPSpaceInitializer::fence() {
   Kokkos::Experimental::HIP::impl_static_fence();
 }
+void HIPSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::HIP::impl_static_fence(name);
+}
 
 void HIPSpaceInitializer::print_configuration(std::ostream& msg,
                                               const bool detail) {
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
index fe52886ced7c7a72454f9e731b3b5b4778f90073..fb67a25c5e7f5e3b0a48118ffe14372f0b1cd2dc 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
@@ -316,198 +316,6 @@ class HIPTeamMember {
 #endif
   }
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
-                    void* const global_scratch_space, void* const shmem,
-                    int const shmem_size) {
-#ifdef __HIP_DEVICE_COMPILE__
-    using value_type   = typename ReducerType::value_type;
-    using pointer_type = value_type volatile*;
-
-    // Number of shared memory entries for the reduction:
-    const int nsh = shmem_size / sizeof(value_type);
-
-    // Number of HIP threads in the block, rank within the block
-    const int nid = blockDim.x * blockDim.y * blockDim.z;
-    const int tid =
-        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
-
-    // Reduces within block using all available shared memory
-    // Contributes if it is the root "vector lane"
-
-    // wn == number of warps in the block
-    // wx == which lane within the warp
-    // wy == which warp within the block
-
-    const int wn = (nid + Experimental::Impl::HIPTraits::WarpIndexMask) >>
-                   Experimental::Impl::HIPTraits::WarpIndexShift;
-    const int wx = tid & Experimental::Impl::HIPTraits::WarpIndexMask;
-    const int wy = tid >> Experimental::Impl::HIPTraits::WarpIndexShift;
-
-    //------------------------
-    {  // Intra warp shuffle reduction from contributing HIP threads
-
-      value_type tmp(reducer.reference());
-
-      int constexpr warp_size =
-          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
-      for (int i = warp_size; static_cast<int>(blockDim.x) <= (i >>= 1);) {
-        Experimental::Impl::in_place_shfl_down(reducer.reference(), tmp, i,
-                                               warp_size);
-
-        // Root of each vector lane reduces "thread" contribution
-        if (0 == threadIdx.x && wx < i) {
-          reducer.join(&tmp, reducer.data());
-        }
-      }
-
-      // Reduce across warps using shared memory.
-      // Number of warps may not be power of two.
-
-      __syncthreads();  // Wait before shared data write
-
-      // Number of shared memory entries for the reduction
-      // is at most one per warp
-      const int nentry = wn < nsh ? wn : nsh;
-
-      if (0 == wx && wy < nentry) {
-        // Root thread of warp 'wy' has warp's value to contribute
-        (reinterpret_cast<value_type*>(shmem))[wy] = tmp;
-      }
-
-      __syncthreads();  // Wait for write to be visible to block
-
-      // When more warps than shared entries
-      // then warps must take turns joining their contribution
-      // to the designated shared memory entry.
-      for (int i = nentry; i < wn; i += nentry) {
-        const int k = wy - i;
-
-        if (0 == wx && i <= wy && k < nentry) {
-          // Root thread of warp 'wy' has warp's value to contribute
-          reducer.join((reinterpret_cast<value_type*>(shmem)) + k, &tmp);
-        }
-
-        __syncthreads();  // Wait for write to be visible to block
-      }
-
-      // One warp performs the inter-warp reduction:
-
-      if (0 == wy) {
-        // Start fan-in at power of two covering nentry
-
-        for (int i = (1 << (warp_size - __clz(nentry - 1))); (i >>= 1);) {
-          const int k = wx + i;
-          if (wx < i && k < nentry) {
-            reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
-                         (reinterpret_cast<pointer_type>(shmem)) + k);
-            __threadfence_block();  // Wait for write to be visible to warp
-          }
-        }
-      }
-    }
-    //------------------------
-    {  // Write block's value to global_scratch_memory
-
-      int last_block = 0;
-
-      if (0 == wx) {
-        reducer.copy((reinterpret_cast<pointer_type>(global_scratch_space)) +
-                         blockIdx.x * reducer.length(),
-                     reducer.data());
-
-        __threadfence();  // Wait until global write is visible.
-
-        last_block = static_cast<int>(gridDim.x) ==
-                     1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);
-
-        // If last block then reset count
-        if (last_block) *global_scratch_flags = 0;
-      }
-
-      // FIXME hip does not support __syncthreads_or so we need to do it by hand
-      // last_block = __syncthreads_or(last_block);
-
-      __shared__ int last_block_shared;
-      if (last_block) last_block_shared = last_block;
-      __threadfence_block();
-
-      if (!last_block_shared) return 0;
-    }
-    //------------------------
-    // Last block reads global_scratch_memory into shared memory.
-
-    const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
-                                       : (gridDim.x < nsh ? gridDim.x : nsh);
-
-    // nentry = min( nid , nsh , gridDim.x )
-
-    // whole block reads global memory into shared memory:
-
-    if (tid < nentry) {
-      const int offset = tid * reducer.length();
-
-      reducer.copy(
-          (reinterpret_cast<pointer_type>(shmem)) + offset,
-          (reinterpret_cast<pointer_type>(global_scratch_space)) + offset);
-
-      for (int i = nentry + tid; i < static_cast<int>(gridDim.x); i += nentry) {
-        reducer.join((reinterpret_cast<pointer_type>(shmem)) + offset,
-                     (reinterpret_cast<pointer_type>(global_scratch_space)) +
-                         i * reducer.length());
-      }
-    }
-
-    __syncthreads();  // Wait for writes to be visible to block
-
-    if (0 == wy) {
-      // Iterate to reduce shared memory to single warp fan-in size
-
-      int constexpr warp_size =
-          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
-      const int nreduce = warp_size < nentry ? warp_size : nentry;
-
-      if (wx < nreduce && nreduce < nentry) {
-        for (int i = nreduce + wx; i < nentry; i += nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
-        }
-        __threadfence_block();  // Wait for writes to be visible to warp
-      }
-
-      // Start fan-in at power of two covering nentry
-
-      for (int i = (1 << (warp_size - __clz(nreduce - 1))); (i >>= 1);) {
-        const int k = wx + i;
-        if (wx < i && k < nreduce) {
-          reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
-                       (reinterpret_cast<pointer_type>(shmem)) + k);
-          __threadfence_block();  // Wait for writes to be visible to warp
-        }
-      }
-
-      if (0 == wx) {
-        reducer.copy(reducer.data(), reinterpret_cast<pointer_type>(shmem));
-        return 1;
-      }
-    }
-    return 0;
-#else
-    (void)reducer;
-    (void)global_scratch_flags;
-    (void)global_scratch_space;
-    (void)shmem;
-    (void)shmem_size;
-    return 0;
-#endif
-  }
-
   //----------------------------------------
   // Private for the driver
 
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
index 910d5e52e6ac62e290f4d7c918217aa518ec3ec7..d9cb66e11f4638c8635d0e9e33d7cbda67e1cada 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
@@ -191,6 +191,9 @@ void HPXSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); }
+void HPXSpaceInitializer::fence(const std::string &name) {
+  Kokkos::Experimental::HPX().fence(name);
+}
 
 void HPXSpaceInitializer::print_configuration(std::ostream &msg,
                                               const bool detail) {
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
index df09e026fd9b45bc1c4f7d0c55e5ae10d336ad72..7bb3ca5d007023d99314c8d45de748da7836136d 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
@@ -82,7 +82,9 @@ class TaskQueueSpecialization<
     task_queue.scheduler = &scheduler;
     Kokkos::Impl::dispatch_execute_task(&task_queue,
                                         Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence();
+    Kokkos::Experimental::HPX().fence(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTask>::execute: fence "
+        "after task execution");
   }
 
   // Must provide task queue execution function
@@ -214,7 +216,7 @@ class TaskQueueSpecializationConstrained<
     task_queue.scheduler = &scheduler;
     Kokkos::Impl::dispatch_execute_task(&task_queue,
                                         Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence();
+    Kokkos::Experimental::HPX().fence(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: fence "
+        "after task execution");
   }
 
   // Must provide task queue execution function
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
index 527fe12ad937f9b89029f12d5c64044f40671572..d7e13e28f054569926382933232b7119ca96a192 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
@@ -79,7 +79,9 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
  public:
   void execute() const {
     dispatch_execute_task(this, m_policy.space());
-    m_policy.space().fence();
+    m_policy.space().fence(
+        "Kokkos::Experimental::Impl::HPX::ParallelFor<WorkGraphPolicy>: fence "
+        "after kernel execution");
   }
 
   void execute_task() const {
diff --git a/packages/kokkos/core/src/KokkosExp_InterOp.hpp b/packages/kokkos/core/src/KokkosExp_InterOp.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..37c2088f88f08758f5f1585b7138f43dd73d54eb
--- /dev/null
+++ b/packages/kokkos/core/src/KokkosExp_InterOp.hpp
@@ -0,0 +1,147 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_EXP_INTEROP_HPP
+#define KOKKOS_CORE_EXP_INTEROP_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_View.hpp>
+#include <impl/Kokkos_Utilities.hpp>
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+// ------------------------------------------------------------------ //
+//  this is used to convert
+//      Kokkos::Device<ExecSpace, MemSpace> to MemSpace
+//
+template <typename Tp>
+struct device_memory_space {
+  using type = Tp;
+};
+
+template <typename ExecT, typename MemT>
+struct device_memory_space<Kokkos::Device<ExecT, MemT>> {
+  using type = MemT;
+};
+
+template <typename Tp>
+using device_memory_space_t = typename device_memory_space<Tp>::type;
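+
+// Usage sketch (illustrative only):
+//   static_assert(
+//       std::is_same<device_memory_space_t<Kokkos::Device<
+//                        Kokkos::DefaultExecutionSpace, Kokkos::HostSpace>>,
+//                    Kokkos::HostSpace>::value,
+//       "Device<Exec, Mem> collapses to Mem");
+//   static_assert(std::is_same<device_memory_space_t<Kokkos::HostSpace>,
+//                              Kokkos::HostSpace>::value,
+//                 "non-Device types pass through unchanged");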
+
+// ------------------------------------------------------------------ //
+//  this is the impl version which takes a view and converts to python
+//  view type
+//
+template <typename, typename...>
+struct python_view_type_impl;
+
+template <template <typename...> class ViewT, typename ValueT,
+          typename... Types>
+struct python_view_type_impl<ViewT<ValueT>, type_list<Types...>> {
+  using type = ViewT<ValueT, device_memory_space_t<Types>...>;
+};
+
+template <template <typename...> class ViewT, typename ValueT,
+          typename... Types>
+struct python_view_type_impl<ViewT<ValueT, Types...>>
+    : python_view_type_impl<ViewT<ValueT>,
+                            filter_type_list_t<is_default_memory_trait,
+                                               type_list<Types...>, false>> {};
+
+template <typename... T>
+using python_view_type_impl_t = typename python_view_type_impl<T...>::type;
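+
+// For example (illustrative only):
+//   python_view_type_impl_t<Kokkos::View<
+//       double*, Kokkos::Device<Kokkos::DefaultExecutionSpace,
+//                               Kokkos::HostSpace>>>
+// is Kokkos::View<double*, Kokkos::HostSpace>: default memory traits are
+// filtered out and each Device argument collapses to its memory space.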
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+template <typename DataType, class... Properties>
+class DynRankView;
+
+namespace Impl {
+
+// Duplicated from the DynRankView header to avoid making core depend on
+// containers.
+template <class>
+struct is_dyn_rank_view_dup : public std::false_type {};
+
+template <class D, class... P>
+struct is_dyn_rank_view_dup<Kokkos::DynRankView<D, P...>>
+    : public std::true_type {};
+
+}  // namespace Impl
+
+namespace Experimental {
+
+// ------------------------------------------------------------------ //
+//  this is used to extract the uniform type of a view
+//
+template <typename ViewT>
+struct python_view_type {
+  static_assert(
+      Kokkos::is_view<std::decay_t<ViewT>>::value ||
+          Kokkos::Impl::is_dyn_rank_view_dup<std::decay_t<ViewT>>::value,
+      "Error! python_view_type only supports Kokkos::View and "
+      "Kokkos::DynRankView");
+
+  using type =
+      Kokkos::Impl::python_view_type_impl_t<typename ViewT::array_type>;
+};
+
+template <typename ViewT>
+using python_view_type_t = typename python_view_type<ViewT>::type;
+
+template <typename Tp>
+auto as_python_type(Tp&& _v) {
+  using cast_type = python_view_type_t<Tp>;
+  return static_cast<cast_type>(std::forward<Tp>(_v));
+}
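+
+// Usage sketch (illustrative only):
+//   auto pv = as_python_type(Kokkos::View<double*>("v", 10));
+//   static_assert(
+//       std::is_same<decltype(pv),
+//                    python_view_type_t<Kokkos::View<double*>>>::value,
+//       "as_python_type casts to the uniform python view type");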
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index b7d8e62f696073bfa4794b362401aaca288de021..dfae7451fc302362743c9349485ee574a15a2d76 100644
--- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -48,6 +48,7 @@
 #include <initializer_list>
 
 #include <Kokkos_Layout.hpp>
+#include <Kokkos_Rank.hpp>
 #include <Kokkos_Array.hpp>
 #include <impl/KokkosExp_Host_IterateTile.hpp>
 #include <Kokkos_ExecPolicy.hpp>
@@ -78,22 +79,6 @@ struct default_inner_direction {
   static constexpr Iterate value = Iterate::Right;
 };
 
-// Iteration Pattern
-template <unsigned N, Iterate OuterDir = Iterate::Default,
-          Iterate InnerDir = Iterate::Default>
-struct Rank {
-  static_assert(N != 0u, "Kokkos Error: rank 0 undefined");
-  static_assert(N != 1u,
-                "Kokkos Error: rank 1 is not a multi-dimensional range");
-  static_assert(N < 7u, "Kokkos Error: Unsupported rank...");
-
-  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
-
-  static constexpr int rank                = N;
-  static constexpr Iterate outer_direction = OuterDir;
-  static constexpr Iterate inner_direction = InnerDir;
-};
-
 namespace Impl {
 // NOTE the comparison below is encapsulated to silence warnings about pointless
 // comparison of unsigned integer with zero
@@ -397,13 +382,18 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility
 namespace Kokkos {
 namespace Experimental {
-using Kokkos::Iterate;
-using Kokkos::MDRangePolicy;
-using Kokkos::Rank;
+using Iterate KOKKOS_DEPRECATED = Kokkos::Iterate;
+template <typename... Properties>
+using MDRangePolicy KOKKOS_DEPRECATED = Kokkos::MDRangePolicy<Properties...>;
+template <unsigned N, Kokkos::Iterate OuterDir = Kokkos::Iterate::Default,
+          Kokkos::Iterate InnerDir = Kokkos::Iterate::Default>
+using Rank KOKKOS_DEPRECATED = Kokkos::Rank<N, OuterDir, InnerDir>;
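+// With KOKKOS_ENABLE_DEPRECATED_CODE_3 enabled, existing spellings such as
+//   Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<2>>
+// still compile (emitting deprecation warnings) and refer to the Kokkos::
+// equivalents.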
 }  // namespace Experimental
 }  // namespace Kokkos
+#endif
 
 #endif  // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp
index 8cd60fa6bae993895ac901fbbab8eb532a6a0ded..a47208e97782ec424a2a96b5ec4de0c58fe2fef2 100644
--- a/packages/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp
@@ -69,6 +69,60 @@
 #define KOKKOS_ATOMIC_HPP
 
 #include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+#define DESUL_HAVE_OPENMP_ATOMICS
+#endif
+#include <Kokkos_Atomics_Desul_Wrapper.hpp>
+#include <Kokkos_Atomics_Desul_Volatile_Wrapper.hpp>
+#include <impl/Kokkos_Utilities.hpp>
+
+// Helper functions for places where we really should have called SeqCst
+// atomics anyway. These can go away when we call desul unconditionally.
+// The non-desul versions are below.
+namespace Kokkos {
+namespace Impl {
+using desul::MemoryOrderSeqCst;
+using desul::MemoryScopeDevice;
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return desul::atomic_dec(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
+                           desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return desul::atomic_inc(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
+                           desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T
+desul_atomic_exchange(T* dest, const Kokkos::Impl::identity_t<T> val,
+                      MemoryOrderSeqCst, MemoryScopeDevice) {
+  return desul::atomic_exchange(const_cast<T*>(dest), val,
+                                desul::MemoryOrderSeqCst(),
+                                desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
+    T* dest, Kokkos::Impl::identity_t<const T> compare,
+    Kokkos::Impl::identity_t<const T> val, MemoryOrderSeqCst,
+    MemoryScopeDevice) {
+  return desul::atomic_compare_exchange(dest, compare, val,
+                                        desul::MemoryOrderSeqCst(),
+                                        desul::MemoryScopeDevice());
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
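+
+// Usage sketch (illustrative only): spelling the tags through Kokkos::Impl
+// lets the same call site compile with and without desul, e.g.
+//   Kokkos::Impl::desul_atomic_inc(&count, Kokkos::Impl::MemoryOrderSeqCst(),
+//                                  Kokkos::Impl::MemoryScopeDevice());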
+#else
+
 #include <Kokkos_HostSpace.hpp>
 #include <impl/Kokkos_Traits.hpp>
 
@@ -326,4 +380,42 @@ inline const char* atomic_query_version() {
 
 //----------------------------------------------------------------------------
 
+// Helper functions for places where we really should have called SeqCst
+// atomics anyway. These can go away when we call desul unconditionally.
+namespace Kokkos {
+namespace Impl {
+struct MemoryOrderSeqCst {};
+struct MemoryScopeDevice {};
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return Kokkos::atomic_decrement(dest);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return Kokkos::atomic_increment(dest);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T
+desul_atomic_exchange(T* dest, Kokkos::Impl::identity_t<const T> val,
+                      MemoryOrderSeqCst, MemoryScopeDevice) {
+  return Kokkos::atomic_exchange(dest, val);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
+    T* dest, Kokkos::Impl::identity_t<const T> compare,
+    Kokkos::Impl::identity_t<const T> val, MemoryOrderSeqCst,
+    MemoryScopeDevice) {
+  return Kokkos::atomic_compare_exchange(dest, compare, val);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif /* !KOKKOS_ENABLE_IMPL_DESUL_ATOMICS */
 #endif /* KOKKOS_ATOMIC_HPP */
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0bcb3ea388beeaf72c862f9519572e5d9e13a530
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
@@ -0,0 +1,189 @@
+#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
+#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics.hpp>
+
+// clang-format off
+namespace Kokkos {
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast<T*>(dest), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// atomic_fetch_op
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_USE_DOUBLE_ATOMICADD
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_add(volatile double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(const_cast<double*>(dest),val);
+  #else
+  return desul::atomic_fetch_add (const_cast<double*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+}
+
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_sub(volatile double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(const_cast<double*>(dest),-val);
+  #else
+  return desul::atomic_fetch_sub (const_cast<double*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+}
+#endif
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or  (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op_fetch
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch  (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_lshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_and yet so call fetch_and
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_or yet so call fetch_or
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_or  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// Exchange
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) {
+  return desul::atomic_compare_exchange_strong(const_cast<T*>(dest),expected, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) {
+  return desul::atomic_compare_exchange(const_cast<T*>(dest),compare, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
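+
+// Usage sketch (illustrative only): the volatile overloads strip the
+// qualifier and forward to desul with relaxed ordering at device scope, e.g.
+//   volatile int counter = 0;
+//   Kokkos::atomic_fetch_add(&counter, 1);
+// forwards to
+//   desul::atomic_fetch_add(const_cast<int*>(&counter), 1,
+//                           desul::MemoryOrderRelaxed(),
+//                           desul::MemoryScopeDevice());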
+
+}  // namespace Kokkos
+// clang-format on
+#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a182a6a22b56ca424d13e3d9f0835070f1cb2f6
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
@@ -0,0 +1,271 @@
+#ifndef KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
+#define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics.hpp>
+
+#include <impl/Kokkos_Atomic_Memory_Order.hpp>
+#include <impl/Kokkos_Volatile_Load.hpp>
+
+// clang-format off
+namespace Kokkos {
+
+// FIXME: These functions are neither used nor covered by the unit tests ...
+// ==========================================================
+inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; }
+
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \
+    !defined(__CUDA_ARCH__)
+
+#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr, 0, 0)
+#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr, 1, 0)
+
+#else
+
+#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
+#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
+
+#endif
+// ============================================================
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { atomic_store(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+void memory_fence() {
+  desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), desul::MemoryScopeDevice());
+}
+
+KOKKOS_INLINE_FUNCTION
+void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), desul::MemoryScopeDevice()); }
+
+KOKKOS_INLINE_FUNCTION
+void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), desul::MemoryScopeDevice()); }
+
+// atomic_fetch_op
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_USE_DOUBLE_ATOMICADD
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_add(double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(dest,val);
+  #else
+  return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+}
+
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_sub(double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(dest,-val);
+  #else
+  return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+}
+#endif
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or  (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or  (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op_fetch
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch  (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch  (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_and yet so call fetch_and
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_or yet so call fetch_or
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val)  { (void) desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// Exchange
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
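+// NOTE: Unlike std::atomic_compare_exchange_strong, this overload takes
+// 'expected' by value, so the observed value is not written back to the
+// caller; only whether the exchange succeeded is returned.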
+template<class T> KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> expected, desul::Impl::dont_deduce_this_parameter_t<const T> desired) {
+  T expected_ref = expected;
+  return desul::atomic_compare_exchange_strong(dest, expected_ref, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> compare, desul::Impl::dont_deduce_this_parameter_t<const T> desired) {
+  return desul::atomic_compare_exchange(dest, compare, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
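+// Map Kokkos memory-order tag types (memory_order_relaxed_t, ...) to their
+// desul equivalents, so the memory-order-aware overloads below can forward
+// the caller's ordering directly to desul.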
+namespace Impl {
+
+  template<class MemoryOrder>
+  struct KokkosToDesulMemoryOrder;
+
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_seq_cst_t> {
+    using type = desul::MemoryOrderSeqCst;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_acquire_t> {
+    using type = desul::MemoryOrderAcquire;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_release_t> {
+    using type = desul::MemoryOrderRelease;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_acq_rel_t> {
+    using type = desul::MemoryOrderAcqRel;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_relaxed_t> {
+    using type = desul::MemoryOrderRelaxed;
+  };
+  template<class T, class MemOrderSuccess, class MemOrderFailure> KOKKOS_INLINE_FUNCTION
+  bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess, MemOrderFailure) {
+    return desul::atomic_compare_exchange_strong(dest, expected, desired,
+                  typename KokkosToDesulMemoryOrder<MemOrderSuccess>::type(),
+                  typename KokkosToDesulMemoryOrder<MemOrderFailure>::type(),
+                  desul::MemoryScopeDevice());
+  }
+  template<class T, class MemoryOrder>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_load(const T* const src, MemoryOrder) {
+    return desul::atomic_load(src, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), desul::MemoryScopeDevice());
+  }
+  template<class T, class MemoryOrder>
+  KOKKOS_INLINE_FUNCTION
+  void atomic_store(T* const src, const T val, MemoryOrder) {
+    return desul::atomic_store(src, val, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), desul::MemoryScopeDevice());
+  }
+}  // namespace Impl
+
+}  // namespace Kokkos
+// clang-format on
+#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp
index 6578723fc8e5dab1e605b1a5dc80f1daf4b2ebfb..466903ab7d6626c0cd7ff97754594cc17933367e 100644
--- a/packages/kokkos/core/src/Kokkos_Complex.hpp
+++ b/packages/kokkos/core/src/Kokkos_Complex.hpp
@@ -77,7 +77,7 @@ class
 
   //! Default constructor (initializes both real and imaginary parts to zero).
   KOKKOS_DEFAULTED_FUNCTION
-  complex() noexcept = default;
+  complex() = default;
 
   //! Copy constructor.
   KOKKOS_DEFAULTED_FUNCTION
@@ -150,11 +150,11 @@ class
 
   //! The imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 RealType& imag() noexcept { return im_; }
+  constexpr RealType& imag() noexcept { return im_; }
 
   //! The real part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 RealType& real() noexcept { return re_; }
+  constexpr RealType& real() noexcept { return re_; }
 
   //! The imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
@@ -166,41 +166,39 @@ class
 
   //! Set the imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  void imag(RealType v) noexcept { im_ = v; }
+  constexpr void imag(RealType v) noexcept { im_ = v; }
 
   //! Set the real part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  void real(RealType v) noexcept { re_ = v; }
+  constexpr void real(RealType v) noexcept { re_ = v; }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator+=(
       const complex<RealType>& src) noexcept {
     re_ += src.re_;
     im_ += src.im_;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator+=(
       const RealType& src) noexcept {
     re_ += src;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator-=(
       const complex<RealType>& src) noexcept {
     re_ -= src.re_;
     im_ -= src.im_;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator-=(
       const RealType& src) noexcept {
     re_ -= src;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator*=(
       const complex<RealType>& src) noexcept {
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
@@ -209,7 +207,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator*=(
       const RealType& src) noexcept {
     re_ *= src;
     im_ *= src;
@@ -217,7 +215,7 @@ class
   }
 
   // Conditional noexcept, just in case RType throws on divide-by-zero
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const complex<RealType>& y) noexcept(noexcept(RealType{} / RealType{})) {
     using Kokkos::Experimental::fabs;
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
@@ -244,8 +242,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14
-  KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const std::complex<RealType>& y) noexcept(noexcept(RealType{} /
                                                          RealType{})) {
     using Kokkos::Experimental::fabs;
@@ -272,7 +269,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const RealType& src) noexcept(noexcept(RealType{} / RealType{})) {
     re_ /= src;
     im_ /= src;
@@ -688,12 +685,24 @@ KOKKOS_INLINE_FUNCTION RealType imag(const complex<RealType>& x) noexcept {
   return x.imag();
 }
 
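+//! Imaginary part of a real number: always zero, returned in the promoted
+//! floating-point type (mirrors std::imag for arithmetic types).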
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t<ArithmeticType> imag(
+    ArithmeticType) {
+  return ArithmeticType();
+}
+
 //! Real part of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION RealType real(const complex<RealType>& x) noexcept {
   return x.real();
 }
 
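+//! Real part of a real number: the value itself, returned in the promoted
+//! floating-point type (mirrors std::real for arithmetic types).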
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t<ArithmeticType> real(
+    ArithmeticType x) {
+  return x;
+}
+
 //! Constructs a complex number from magnitude and phase angle
 template <class T>
 KOKKOS_INLINE_FUNCTION complex<T> polar(const T& r, const T& theta = T()) {
@@ -733,36 +742,6 @@ KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x,
   return x == T() ? T() : exp(y * log(x));
 }
 
-namespace Impl {
-// NOTE promote would also be useful for math functions
-template <class T, bool = std::is_integral<T>::value>
-struct promote {
-  using type = double;
-};
-template <class T>
-struct promote<T, false> {};
-template <>
-struct promote<long double> {
-  using type = long double;
-};
-template <>
-struct promote<double> {
-  using type = double;
-};
-template <>
-struct promote<float> {
-  using type = float;
-};
-template <class T>
-using promote_t = typename promote<T>::type;
-template <class T, class U>
-struct promote_2 {
-  using type = decltype(promote_t<T>() + promote_t<U>());
-};
-template <class T, class U>
-using promote_2_t = typename promote_2<T, U>::type;
-}  // namespace Impl
-
 template <class T, class U,
           class = std::enable_if_t<std::is_arithmetic<T>::value>>
 KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(
@@ -816,6 +795,13 @@ KOKKOS_INLINE_FUNCTION complex<RealType> conj(
   return complex<RealType>(real(x), -imag(x));
 }
 
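+//! Complex conjugate of a real number: the value as a complex number with a
+//! zero imaginary part, in the promoted floating-point type.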
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr complex<Impl::promote_t<ArithmeticType>> conj(
+    ArithmeticType x) {
+  using type = Impl::promote_t<ArithmeticType>;
+  return complex<type>(x, -type());
+}
+
 //! Exponential of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) {
diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp
index 2aba189487490d4f870cec407ec1d1f3b9ed001e..97137387f264a869dacb6b3b5abdd7fa5a9ba5ff 100644
--- a/packages/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp
@@ -180,20 +180,23 @@ KOKKOS_IMPL_IS_CONCEPT(work_item_property)
 
 namespace Impl {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility:
 
-using Kokkos::is_array_layout;
-using Kokkos::is_execution_policy;
-using Kokkos::is_execution_space;
-using Kokkos::is_memory_space;
-using Kokkos::is_memory_traits;
+template <typename T>
+using is_array_layout KOKKOS_DEPRECATED = Kokkos::is_array_layout<T>;
+template <typename T>
+using is_execution_policy KOKKOS_DEPRECATED = Kokkos::is_execution_policy<T>;
+template <typename T>
+using is_execution_space KOKKOS_DEPRECATED = Kokkos::is_execution_space<T>;
+template <typename T>
+using is_memory_space KOKKOS_DEPRECATED = Kokkos::is_memory_space<T>;
+template <typename T>
+using is_memory_traits KOKKOS_DEPRECATED = Kokkos::is_memory_traits<T>;
+#endif
 
 // Implementation concept:
 
-KOKKOS_IMPL_IS_CONCEPT(iteration_pattern)
-KOKKOS_IMPL_IS_CONCEPT(schedule_type)
-KOKKOS_IMPL_IS_CONCEPT(index_type)
-KOKKOS_IMPL_IS_CONCEPT(launch_bounds)
 KOKKOS_IMPL_IS_CONCEPT(thread_team_member)
 KOKKOS_IMPL_IS_CONCEPT(host_thread_team_member)
 KOKKOS_IMPL_IS_CONCEPT(graph_kernel)
@@ -330,42 +333,65 @@ struct is_space {
   // For backward compatibility, deprecated in favor of
   // Kokkos::Impl::HostMirror<S>::host_mirror_space
 
-  using host_memory_space = typename std::conditional<
+ private:
+  // The actual definitions for host_memory_space and host_execution_space are
+  // in do_not_use_host_memory_space and do_not_use_host_execution_space to be
+  // able to use them within this class without deprecation warnings.
+  using do_not_use_host_memory_space = std::conditional_t<
       std::is_same<memory_space, Kokkos::HostSpace>::value
 #if defined(KOKKOS_ENABLE_CUDA)
           || std::is_same<memory_space, Kokkos::CudaUVMSpace>::value ||
           std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value
-#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+#elif defined(KOKKOS_ENABLE_HIP)
+          || std::is_same<memory_space,
+                          Kokkos::Experimental::HIPHostPinnedSpace>::value
+#elif defined(KOKKOS_ENABLE_SYCL)
+          || std::is_same<memory_space,
+                          Kokkos::Experimental::SYCLSharedUSMSpace>::value ||
+          std::is_same<memory_space,
+                       Kokkos::Experimental::SYCLHostUSMSpace>::value
+#endif
       ,
-      memory_space, Kokkos::HostSpace>::type;
+      memory_space, Kokkos::HostSpace>;
 
+  using do_not_use_host_execution_space = std::conditional_t<
 #if defined(KOKKOS_ENABLE_CUDA)
-  using host_execution_space = typename std::conditional<
-      std::is_same<execution_space, Kokkos::Cuda>::value,
-      Kokkos::DefaultHostExecutionSpace, execution_space>::type;
-#else
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-  using host_execution_space = typename std::conditional<
-      std::is_same<execution_space, Kokkos::Experimental::OpenMPTarget>::value,
-      Kokkos::DefaultHostExecutionSpace, execution_space>::type;
-#else
-  using host_execution_space = execution_space;
-#endif
+      std::is_same<execution_space, Kokkos::Cuda>::value ||
+#elif defined(KOKKOS_ENABLE_HIP)
+      std::is_same<execution_space, Kokkos::Experimental::HIP>::value ||
+#elif defined(KOKKOS_ENABLE_SYCL)
+      std::is_same<execution_space, Kokkos::Experimental::SYCL>::value ||
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
+      std::is_same<execution_space,
+                   Kokkos::Experimental::OpenMPTarget>::value ||
 #endif
+          false,
+      Kokkos::DefaultHostExecutionSpace, execution_space>;
 
-  using host_mirror_space = typename std::conditional<
-      std::is_same<execution_space, host_execution_space>::value &&
-          std::is_same<memory_space, host_memory_space>::value,
-      T, Kokkos::Device<host_execution_space, host_memory_space>>::type;
+ public:
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using host_memory_space KOKKOS_DEPRECATED = do_not_use_host_memory_space;
+  using host_execution_space KOKKOS_DEPRECATED =
+      do_not_use_host_execution_space;
+  using host_mirror_space KOKKOS_DEPRECATED = std::conditional_t<
+      std::is_same<execution_space, do_not_use_host_execution_space>::value &&
+          std::is_same<memory_space, do_not_use_host_memory_space>::value,
+      T,
+      Kokkos::Device<do_not_use_host_execution_space,
+                     do_not_use_host_memory_space>>;
+#endif
 };
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility
 
 namespace Impl {
 
-using Kokkos::is_space;
+template <typename T>
+using is_space KOKKOS_DEPRECATED = Kokkos::is_space<T>;
 
 }
+#endif
 
 }  // namespace Kokkos
 
@@ -485,13 +511,18 @@ struct SpaceAccessibility {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 namespace Impl {
 
-using Kokkos::SpaceAccessibility;  // For backward compatibility
+// For backward compatibility
+template <typename AccessSpace, typename MemorySpace>
+using SpaceAccessibility KOKKOS_DEPRECATED =
+    Kokkos::SpaceAccessibility<AccessSpace, MemorySpace>;
 
-}
+}  // namespace Impl
 }  // namespace Kokkos
+#endif
 
 //----------------------------------------------------------------------------
 
diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
index a27d5f0e47284f7d06b3d9218d1f02bfb679468e..16946dd602b536793f8746165ebd4bb82631742b 100644
--- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp
+++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
@@ -47,6 +47,7 @@
 #include <string>
 #include <Kokkos_Parallel.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
+#include <Kokkos_Layout.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -544,13 +545,11 @@ void view_copy(const ExecutionSpace& space, const DstType& dst,
 
   enum {
     ExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<ExecutionSpace,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecutionSpace, src_memory_space>::accessible
   };
   enum {
     ExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<ExecutionSpace,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecutionSpace, dst_memory_space>::accessible
   };
 
   if (!(ExecCanAccessSrc && ExecCanAccessDst)) {
@@ -624,14 +623,14 @@ void view_copy(const DstType& dst, const SrcType& src) {
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) {
@@ -1254,6 +1253,98 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
   }
 };
 
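+// Fill a contiguous view by reinterpreting its storage as a flat rank-1
+// LayoutRight view and running ViewFill on it; a 32-bit index type is used
+// when the span fits into int.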
+template <typename ExecutionSpace, class DT, class... DP>
+inline void contiguous_fill(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType     = View<DT, DP...>;
+  using ViewTypeFlat = Kokkos::View<
+      typename ViewType::value_type*, Kokkos::LayoutRight,
+      Kokkos::Device<typename ViewType::execution_space,
+                     typename std::conditional<ViewType::Rank == 0,
+                                               typename ViewType::memory_space,
+                                               Kokkos::AnonymousSpace>::type>,
+      Kokkos::MemoryTraits<0>>;
+
+  ViewTypeFlat dst_flat(dst.data(), dst.size());
+  if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) {
+    Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
+                           ViewTypeFlat::Rank, int>(dst_flat, value,
+                                                    exec_space);
+  } else
+    Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
+                           ViewTypeFlat::Rank, int64_t>(dst_flat, value,
+                                                        exec_space);
+}
+
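+// Generic fallback: "zeroing" a view is just a contiguous fill with the given
+// value. Backends may specialize ZeroMemset to use an actual memset (see the
+// Cuda specialization in Kokkos_Cuda.hpp).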
+template <typename ExecutionSpace, class DT, class... DP>
+struct ZeroMemset {
+  ZeroMemset(const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+             typename ViewTraits<DT, DP...>::const_value_type& value) {
+    contiguous_fill(exec_space, dst, value);
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename ViewTraits<DT, DP...>::const_value_type& value) {
+    contiguous_fill(ExecutionSpace(), dst, value);
+  }
+};
+
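+// Use ZeroMemset only when the value type is trivial and trivially copy
+// assignable and the fill value is bytewise zero; all other cases, including
+// the non-trivial overloads below, fall back to contiguous_fill.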
+template <typename ExecutionSpace, class DT, class... DP>
+inline std::enable_if_t<
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value>
+contiguous_fill_or_memset(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  if (Impl::is_zero_byte(value))
+    ZeroMemset<ExecutionSpace, DT, DP...>(exec_space, dst, value);
+  else
+    contiguous_fill(exec_space, dst, value);
+}
+
+template <typename ExecutionSpace, class DT, class... DP>
+inline std::enable_if_t<!(
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+contiguous_fill_or_memset(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  contiguous_fill(exec_space, dst, value);
+}
+
+template <class DT, class... DP>
+inline std::enable_if_t<
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value>
+contiguous_fill_or_memset(
+    const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType        = View<DT, DP...>;
+  using exec_space_type = typename ViewType::execution_space;
+
+  if (Impl::is_zero_byte(value))
+    ZeroMemset<exec_space_type, DT, DP...>(dst, value);
+  else
+    contiguous_fill(exec_space_type(), dst, value);
+}
+
+template <class DT, class... DP>
+inline std::enable_if_t<!(
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+contiguous_fill_or_memset(
+    const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType        = View<DT, DP...>;
+  using exec_space_type = typename ViewType::execution_space;
+
+  contiguous_fill(exec_space_type(), dst, value);
+}
 }  // namespace Impl
 
 /** \brief  Deep copy a value from Host memory into a view.  */
@@ -1276,38 +1367,23 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: scalar copy, fence because destination is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
     return;
   }
 
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence");
   static_assert(std::is_same<typename ViewType::non_const_value_type,
                              typename ViewType::value_type>::value,
                 "deep_copy requires non-const type");
 
-  // If contiguous we can simply do a 1D flat loop
+  // If contiguous we can simply do a 1D flat loop or use memset
   if (dst.span_is_contiguous()) {
-    using ViewTypeFlat = Kokkos::View<
-        typename ViewType::value_type*, Kokkos::LayoutRight,
-        Kokkos::Device<typename ViewType::execution_space,
-                       typename std::conditional<
-                           ViewType::Rank == 0, typename ViewType::memory_space,
-                           Kokkos::AnonymousSpace>::type>,
-        Kokkos::MemoryTraits<0>>;
-
-    ViewTypeFlat dst_flat(dst.data(), dst.size());
-    if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) {
-      Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type,
-                             ViewTypeFlat::Rank, int>(dst_flat, value,
-                                                      exec_space_type());
-    } else
-      Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type,
-                             ViewTypeFlat::Rank, int64_t>(dst_flat, value,
-                                                          exec_space_type());
-    Kokkos::fence();
+    Impl::contiguous_fill_or_memset(dst, value);
+    Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1362,7 +1438,7 @@ inline void deep_copy(
                              exec_space_type, ViewType::Rank, int>(
           dst, value, exec_space_type());
   }
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence");
 
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -1393,7 +1469,7 @@ inline void deep_copy(
   }
 
   if (src.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence("Kokkos::deep_copy: copy into scalar, src is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1439,18 +1515,19 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr && src.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: scalar to scalar copy, both pointers null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
     return;
   }
 
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar to scalar copy, pre copy fence");
   if (dst.data() != src.data()) {
     Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
         dst.data(), src.data(), sizeof(value_type));
-    Kokkos::fence();
+    Kokkos::fence("Kokkos::deep_copy: scalar to scalar copy, post copy fence");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -1522,7 +1599,9 @@ inline void deep_copy(
 
       Kokkos::Impl::throw_runtime_exception(message);
     }
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, fence due to null "
+        "argument");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1531,14 +1610,14 @@ inline void deep_copy(
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   // Checking for Overlapping Views.
@@ -1549,7 +1628,9 @@ inline void deep_copy(
   if (((std::ptrdiff_t)dst_start == (std::ptrdiff_t)src_start) &&
       ((std::ptrdiff_t)dst_end == (std::ptrdiff_t)src_end) &&
       (dst.span_is_contiguous() && src.span_is_contiguous())) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, fence due to same "
+        "spans");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1620,16 +1701,22 @@ inline void deep_copy(
       ((dst_type::rank < 7) || (dst.stride_6() == src.stride_6())) &&
       ((dst_type::rank < 8) || (dst.stride_7() == src.stride_7()))) {
     const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, pre view equality "
+        "check");
     if ((void*)dst.data() != (void*)src.data()) {
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::deep_copy: copy between contiguous views, post deep copy "
+          "fence");
     }
   } else {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, pre copy fence");
     Impl::view_copy(dst, src);
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, post copy fence");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -2418,9 +2505,9 @@ inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             ExecSpace,
             typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
         nullptr) {
@@ -2437,7 +2524,9 @@ inline void deep_copy(
         "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
   }
   if (dst.data() == nullptr) {
-    space.fence();
+    space.fence("Kokkos::deep_copy: scalar copy on space, dst data is null");
+  } else if (dst.span_is_contiguous()) {
+    Impl::contiguous_fill_or_memset(space, dst, value);
   } else {
     using ViewTypeUniform = typename std::conditional<
         View<DT, DP...>::Rank == 0,
@@ -2458,9 +2547,9 @@ inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        !Kokkos::Impl::SpaceAccessibility<
+        !Kokkos::SpaceAccessibility<
             ExecSpace,
             typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
         nullptr) {
@@ -2477,17 +2566,23 @@ inline void deep_copy(
         "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
   }
   if (dst.data() == nullptr) {
-    space.fence();
+    space.fence(
+        "Kokkos::deep_copy: scalar-to-view copy on space, dst data is null");
   } else {
-    space.fence();
-    using ViewTypeUniform = typename std::conditional<
-        View<DT, DP...>::Rank == 0,
-        typename View<DT, DP...>::uniform_runtime_type,
-        typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
+    space.fence("Kokkos::deep_copy: scalar-to-view copy on space, pre copy");
     using fill_exec_space = typename dst_traits::memory_space::execution_space;
-    Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
-                           fill_exec_space>(dst, value, fill_exec_space());
-    fill_exec_space().fence();
+    if (dst.span_is_contiguous()) {
+      Impl::contiguous_fill_or_memset(fill_exec_space(), dst, value);
+    } else {
+      using ViewTypeUniform = typename std::conditional<
+          View<DT, DP...>::Rank == 0,
+          typename View<DT, DP...>::uniform_runtime_type,
+          typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
+      Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
+                             fill_exec_space>(dst, value, fill_exec_space());
+    }
+    fill_exec_space().fence(
+        "Kokkos::deep_copy: scalar-to-view copy on space, fence after fill");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -2501,7 +2596,7 @@ inline void deep_copy(
     typename ViewTraits<ST, SP...>::non_const_value_type& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize,
                      void>::value>::type* = nullptr) {
   using src_traits       = ViewTraits<ST, SP...>;
@@ -2517,7 +2612,8 @@ inline void deep_copy(
   }
 
   if (src.data() == nullptr) {
-    exec_space.fence();
+    exec_space.fence(
+        "Kokkos::deep_copy: view-to-scalar copy on space, src data is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -2538,7 +2634,7 @@ inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<(
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
         (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
@@ -2562,7 +2658,8 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr && src.data() == nullptr) {
-    exec_space.fence();
+    exec_space.fence(
+        "Kokkos::deep_copy: view-to-view copy on space, data is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -2588,7 +2685,7 @@ inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<(
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
         (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
@@ -2662,21 +2759,19 @@ inline void deep_copy(
 
   enum {
     ExecCanAccessSrcDst =
-        Kokkos::Impl::SpaceAccessibility<ExecSpace,
-                                         dst_memory_space>::accessible &&
-        Kokkos::Impl::SpaceAccessibility<ExecSpace,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecSpace, dst_memory_space>::accessible &&
+        Kokkos::SpaceAccessibility<ExecSpace, src_memory_space>::accessible
   };
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   // Error out for non-identical overlapping views.
@@ -2757,9 +2852,13 @@ inline void deep_copy(
       using cpy_exec_space =
           typename std::conditional<DstExecCanAccessSrc, dst_execution_space,
                                     src_execution_space>::type;
-      exec_space.fence();
+      exec_space.fence(
+          "Kokkos::deep_copy: view-to-view noncontiguous copy on space, pre "
+          "copy");
       Impl::view_copy(cpy_exec_space(), dst, src);
-      cpy_exec_space().fence();
+      cpy_exec_space().fence(
+          "Kokkos::deep_copy: view-to-view noncontiguous copy on space, post "
+          "copy");
     } else {
       Kokkos::Impl::throw_runtime_exception(
           "deep_copy given views that would require a temporary allocation");
@@ -2777,6 +2876,19 @@ inline void deep_copy(
 
 namespace Kokkos {
 
+namespace Impl {
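+// Returns true if any of the first max_extent requested extents differs from
+// the view's current extent in that dimension.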
+template <typename ViewType>
+bool size_mismatch(const ViewType& view, unsigned int max_extent,
+                   const size_t new_extents[8]) {
+  for (unsigned int dim = 0; dim < max_extent; ++dim)
+    if (new_extents[dim] != view.extent(dim)) {
+      return true;
+    }
+  return false;
+}
+
+}  // namespace Impl
+
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
 template <class T, class... P>
@@ -2798,67 +2910,6 @@ resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
 
-  // Fix #904 by checking dimensions before actually resizing.
-  //
-  // Rank is known at compile time, so hopefully the compiler will
-  // remove branches that are compile-time false.  The upcoming "if
-  // constexpr" language feature would make this certain.
-  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
-    return;
-  }
-  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1))) {
-    return;
-  }
-  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2))) {
-    return;
-  }
-  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3))) {
-    return;
-  }
-  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4))) {
-    return;
-  }
-  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5))) {
-    return;
-  }
-  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6))) {
-    return;
-  }
-  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6)) &&
-      n7 == static_cast<size_t>(v.extent(7))) {
-    return;
-  }
-  // If Kokkos ever supports Views of rank > 8, the above code won't
-  // be incorrect, because avoiding reallocation in resize() is just
-  // an optimization.
-
   // TODO (mfh 27 Jun 2017) If the old View has enough space but just
   // different dimensions (e.g., if the product of the dimensions,
   // including extra space for alignment, will not change), then
@@ -2866,11 +2917,17 @@ resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
   // reallocates if any of the dimensions change, even if the old View
   // has enough space.
 
-  view_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);
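+  // Fix #904: only reallocate when a requested extent actually differs from
+  // the current one; only the dynamic extents can change here.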
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+  if (sizeMismatch) {
+    view_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);
 
-  v = v_resized;
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
 }
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
@@ -2895,67 +2952,6 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
 
-  // Fix #904 by checking dimensions before actually resizing.
-  //
-  // Rank is known at compile time, so hopefully the compiler will
-  // remove branches that are compile-time false.  The upcoming "if
-  // constexpr" language feature would make this certain.
-  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
-    return;
-  }
-  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1))) {
-    return;
-  }
-  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2))) {
-    return;
-  }
-  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3))) {
-    return;
-  }
-  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4))) {
-    return;
-  }
-  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5))) {
-    return;
-  }
-  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6))) {
-    return;
-  }
-  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6)) &&
-      n7 == static_cast<size_t>(v.extent(7))) {
-    return;
-  }
-  // If Kokkos ever supports Views of rank > 8, the above code won't
-  // be incorrect, because avoiding reallocation in resize() is just
-  // an optimization.
-
   // TODO (mfh 27 Jun 2017) If the old View has enough space but just
   // different dimensions (e.g., if the product of the dimensions,
   // including extra space for alignment, will not change), then
@@ -2963,19 +2959,64 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v,
   // reallocates if any of the dimensions change, even if the old View
   // has enough space.
 
-  view_type v_resized(view_alloc(v.label(), std::forward<const I>(arg_prop)),
-                      n0, n1, n2, n3, n4, n5, n6, n7);
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+  if (sizeMismatch) {
+    view_type v_resized(view_alloc(v.label(), std::forward<const I>(arg_prop)),
+                        n0, n1, n2, n3, n4, n5, n6, n7);
 
-  v = v_resized;
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    // This fence really ought to look for an execution space in
+    // arg_prop, and just fence that if there is one
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
 }
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
 template <class T, class... P>
-inline void resize(Kokkos::View<T, P...>& v,
-                   const typename Kokkos::View<T, P...>::array_layout& layout) {
+inline std::enable_if_t<
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutLeft>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutRight>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutStride>::value ||
+    is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value>
+resize(Kokkos::View<T, P...>& v,
+       const typename Kokkos::View<T, P...>::array_layout& layout) {
+  using view_type = Kokkos::View<T, P...>;
+
+  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
+                "Can only resize managed views");
+
+  if (v.layout() != layout) {
+    view_type v_resized(v.label(), layout);
+
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
+}
+
+// FIXME: User-provided (custom) layouts are not required to have a comparison
+// operator. Hence, there is no way to check if the requested layout is actually
+// the same as the existing one.
+template <class T, class... P>
+inline std::enable_if_t<
+    !(std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutLeft>::value ||
+      std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutRight>::value ||
+      std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutStride>::value ||
+      is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value)>
+resize(Kokkos::View<T, P...>& v,
+       const typename Kokkos::View<T, P...>::array_layout& layout) {
   using view_type = Kokkos::View<T, P...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
@@ -3009,10 +3050,16 @@ realloc(Kokkos::View<T, P...>& v,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only realloc managed views");
 
-  const std::string label = v.label();
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  v = view_type();  // Deallocate first, if the only view to allocation
-  v = view_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
+  if (sizeMismatch) {
+    const std::string label = v.label();
+
+    v = view_type();  // Deallocate first, if the only view to allocation
+    v = view_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
+  } else
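+    // Extents unchanged: keep the allocation but reset the contents,
+    // matching realloc's discard-old-data semantics.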
+    Kokkos::deep_copy(v, typename view_type::value_type{});
 }
 
 /** \brief  Resize a view with discarding old data. */
@@ -3209,7 +3256,8 @@ create_mirror_view_and_copy(
         Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
         nullptr) {
   (void)name;
-  fence();  // same behavior as deep_copy(src, src)
+  // same behavior as deep_copy(src, src)
+  fence(
+      "Kokkos::create_mirror_view_and_copy: fence before returning src view");
   return src;
 }
 
diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp
index c3771ab393f3aaf8f77cb474056d90e867ff03da..60e748589df593dbb9e549f6433daea77b5bc6b0 100644
--- a/packages/kokkos/core/src/Kokkos_Core.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core.hpp
@@ -59,6 +59,7 @@
 #include <Kokkos_LogicalSpaces.hpp>
 #include <Kokkos_Pair.hpp>
 #include <Kokkos_MathematicalFunctions.hpp>
+#include <Kokkos_MathematicalSpecialFunctions.hpp>
 #include <Kokkos_MemoryPool.hpp>
 #include <Kokkos_Array.hpp>
 #include <Kokkos_View.hpp>
@@ -74,6 +75,7 @@
 #include <iosfwd>
 #include <map>
 #include <memory>
+#include <vector>
 
 //----------------------------------------------------------------------------
 
@@ -121,6 +123,7 @@ class ExecSpaceManager {
   void initialize_spaces(const Kokkos::InitArguments& args);
   void finalize_spaces(const bool all_spaces);
   void static_fence();
+  void static_fence(const std::string&);
   void print_configuration(std::ostream& msg, const bool detail);
   static ExecSpaceManager& get_instance();
 };
@@ -184,6 +187,7 @@ void push_finalize_hook(std::function<void()> f);
 void finalize_all();
 
 void fence();
+void fence(const std::string&);
 
 /** \brief Print "Bill of Materials" */
 void print_configuration(std::ostream&, const bool detail = false);
@@ -274,6 +278,44 @@ class ScopeGuard {
 
 }  // namespace Kokkos
 
+namespace Kokkos {
+namespace Experimental {
+// Partitioning an Execution Space: expects a space and integer arguments
+// giving relative weights.
+//   Customization point for backends.
+//   Default behavior is to return copies of the passed-in instance.
+template <class ExecSpace, class... Args>
+std::vector<ExecSpace> partition_space(ExecSpace space, Args...) {
+  static_assert(is_execution_space<ExecSpace>::value,
+                "Kokkos Error: partition_space expects an Execution Space as "
+                "first argument");
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+  std::vector<ExecSpace> instances(sizeof...(Args));
+  for (int s = 0; s < int(sizeof...(Args)); s++) instances[s] = space;
+  return instances;
+}
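+// A minimal usage sketch (hypothetical weights; any arithmetic values work):
+//   auto instances = Kokkos::Experimental::partition_space(
+//       Kokkos::DefaultExecutionSpace(), 1, 2, 1);
+//   // instances.size() == 3; this default implementation returns copies of
+//   // the given space and ignores the weights.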
+
+template <class ExecSpace, class T>
+std::vector<ExecSpace> partition_space(ExecSpace space,
+                                       std::vector<T>& weights) {
+  static_assert(is_execution_space<ExecSpace>::value,
+                "Kokkos Error: partition_space expects an Execution Space as "
+                "first argument");
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<ExecSpace> instances(weights.size());
+  for (int s = 0; s < int(weights.size()); s++) instances[s] = space;
+  return instances;
+}
+}  // namespace Experimental
+}  // namespace Kokkos
+
 #include <Kokkos_Crs.hpp>
 #include <Kokkos_WorkGraphPolicy.hpp>
 // Including this in Kokkos_Parallel_Reduce.hpp led to a circular dependency
diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
index fe7eba3f6ef178848d2ea832341014d6dc5d1003..a610ee76dffb6fd23fadae40437b893eaab5cc87 100644
--- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -53,7 +53,9 @@
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Utilities.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 #include <Kokkos_MasterLock.hpp>
+#endif
 
 //----------------------------------------------------------------------------
 // Have assumed a 64bit build (8byte pointers) throughout the code base.
@@ -238,7 +240,8 @@ class LogicalMemorySpace;
 
 namespace Kokkos {
 void fence();
-}
+void fence(const std::string &);
+}  // namespace Kokkos
 
 //----------------------------------------------------------------------------
 
@@ -250,9 +253,13 @@ class View;
 namespace Impl {
 
 template <class DstSpace, class SrcSpace,
-          class ExecutionSpace = typename DstSpace::execution_space>
+          class ExecutionSpace = typename DstSpace::execution_space,
+          class Enable         = void>
 struct DeepCopy;
 
+template <typename ExecutionSpace, class DT, class... DP>
+struct ZeroMemset;
+
 template <class ViewType, class Layout = typename ViewType::array_layout,
           class ExecSpace = typename ViewType::execution_space,
           int Rank = ViewType::Rank, typename iType = int64_t>
diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp
index 1a10500b19a55f4f963807dd2cf1a28e6062f98c..897402d37643bf8876360b3e828c685c6251fe19 100644
--- a/packages/kokkos/core/src/Kokkos_Crs.hpp
+++ b/packages/kokkos/core/src/Kokkos_Crs.hpp
@@ -179,7 +179,9 @@ class GetCrsTransposeCounts {
     const closure_type closure(*this,
                                policy_type(0, index_type(in.entries.size())));
     closure.execute();
-    execution_space().fence();
+    execution_space().fence(
+        "Kokkos::Impl::GetCrsTransposeCounts::GetCrsTransposeCounts: fence "
+        "after functor execution");
   }
 };
 
@@ -261,7 +263,9 @@ class FillCrsTransposeEntries {
     using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
     const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
     closure.execute();
-    execution_space().fence();
+    execution_space().fence(
+        "Kokkos::Impl::FillCrsTransposeEntries::FillCrsTransposeEntries: fence "
+        "after functor execution");
   }
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp
index 7a218120bb7bb3b053335946ae25ad58c8a85e6d..c5a6b0f7d7d579e2ccad3c05f97d042d4ed63471 100644
--- a/packages/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp
@@ -55,13 +55,13 @@
 
 #include <impl/Kokkos_AnalyzePolicy.hpp>
 #include <Kokkos_CudaSpace.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
 
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
 
@@ -184,8 +184,10 @@ class Cuda {
   /// method does not return until all dispatched functors on this
   /// device have completed.
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
@@ -199,7 +201,7 @@ class Cuda {
 
   Cuda();
 
-  Cuda(cudaStream_t stream);
+  Cuda(cudaStream_t stream, bool manage_stream = false);
 
   //--------------------------------------------------------------------------
   //! \name Device-specific functions
@@ -246,7 +248,7 @@ class Cuda {
   inline Impl::CudaInternal* impl_internal_space_instance() const {
     return m_space_instance.get();
   }
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Kokkos::Impl::HostSharedPtr<Impl::CudaInternal> m_space_instance;
@@ -271,9 +273,28 @@ class CudaSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool all_spaces) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
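+// ZeroMemset specialization for Cuda: zero-fills the view's storage with
+// cudaMemsetAsync on the instance's stream (or a synchronous cudaMemset when
+// no instance is given) instead of launching a ViewFill kernel.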
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Cuda, DT, DP...> {
+  ZeroMemset(const Kokkos::Cuda& exec_space_instance,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemsetAsync(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type),
+        exec_space_instance.cuda_stream()));
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMemset(dst.data(), 0,
+                   dst.size() * sizeof(typename View<DT, DP...>::value_type)));
+  }
+};
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
index e10fae93c7ca01ce90f31b5d22ca9bff7d113884..910a8b2d7470b65b66d397a2507eb96723be447c 100644
--- a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -70,6 +70,12 @@ extern "C" void kokkos_impl_cuda_set_pin_uvm_to_host(bool);
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+namespace Impl {
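+// Trait identifying the Cuda-backed memory spaces (CudaSpace, CudaUVMSpace,
+// CudaHostPinnedSpace); specialized to true_type for each of them below.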
+
+template <typename T>
+struct is_cuda_type_space : public std::false_type {};
+
+}  // namespace Impl
 
 /** \brief  Cuda on-device memory management */
 
@@ -119,10 +125,12 @@ class CudaSpace {
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return m_name; }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
   KOKKOS_DEPRECATED static void access_error();
   KOKKOS_DEPRECATED static void access_error(const void* const);
+#endif
 
  private:
   int m_device;  ///< Which Cuda device
@@ -130,6 +138,10 @@ class CudaSpace {
   static constexpr const char* m_name = "Cuda";
   friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
 };
+
+template <>
+struct Impl::is_cuda_type_space<CudaSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -151,9 +163,11 @@ class CudaUVMSpace {
   /** \brief  If UVM capability is available */
   static bool available();
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  CudaUVMSpace specific routine */
   KOKKOS_DEPRECATED static int number_of_allocations();
+#endif
 
   /*--------------------------------*/
 
@@ -209,6 +223,9 @@ class CudaUVMSpace {
   static constexpr const char* m_name = "CudaUVM";
 };
 
+template <>
+struct Impl::is_cuda_type_space<CudaUVMSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -271,6 +288,9 @@ class CudaHostPinnedSpace {
   /*--------------------------------*/
 };
 
+template <>
+struct Impl::is_cuda_type_space<CudaHostPinnedSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -411,338 +431,107 @@ struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace> {
 namespace Kokkos {
 namespace Impl {
 
+void DeepCopyCuda(void* dst, const void* src, size_t n);
+void DeepCopyAsyncCuda(const Cuda& instance, void* dst, const void* src,
+                       size_t n);
 void DeepCopyAsyncCuda(void* dst, const void* src, size_t n);
 
-template <>
-struct DeepCopy<CudaSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<CudaSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
+                                 is_cuda_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaSpace, ExecutionSpace> {
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<MemSpace1, MemSpace2, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
+                                 is_cuda_type_space<MemSpace2>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
-};
 
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<MemSpace, HostSpace, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
-  }
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaUVMSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
-};
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
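
The hunk above collapses the per-space-pair `DeepCopy` specializations into a few partial specializations constrained by the new `is_cuda_type_space` trait. A self-contained sketch of the same SFINAE pattern, using hypothetical stand-in types rather than the Kokkos classes and dropping the execution-space parameter for brevity:

```cpp
#include <cstddef>
#include <cstdio>
#include <type_traits>

struct HostSpace {};
struct DevSpace {};     // stand-in for CudaSpace
struct DevUVMSpace {};  // stand-in for CudaUVMSpace

template <class T>
struct is_dev_type_space : std::false_type {};
template <>
struct is_dev_type_space<DevSpace> : std::true_type {};
template <>
struct is_dev_type_space<DevUVMSpace> : std::true_type {};

// The primary template carries a defaulted SFINAE slot, mirroring the
// trailing template parameter of Kokkos::Impl::DeepCopy.
template <class Dst, class Src, class = void>
struct DeepCopy;

// One constrained partial specialization now covers every device-like
// destination, where the old code needed one explicit specialization per
// (destination, source) pair.
template <class Dst>
struct DeepCopy<Dst, HostSpace,
                std::enable_if_t<is_dev_type_space<Dst>::value>> {
  DeepCopy(void*, const void*, std::size_t n) {
    std::printf("host -> device-like copy of %zu bytes\n", n);
  }
};

int main() {
  char a[8], b[8];
  DeepCopy<DevSpace, HostSpace>(a, b, sizeof a);     // matches
  DeepCopy<DevUVMSpace, HostSpace>(a, b, sizeof a);  // matches too
}
```

The `fence_string()` helper in the real code follows a related idea: a function-local static builds the fence label once per instantiated space pair, so repeated copies pay no string construction.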
 
diff --git a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e060b343eb77ffd9783b2449def926704032334
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp
@@ -0,0 +1,116 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_DETECTION_IDIOM_HPP
+#define KOKKOS_DETECTION_IDIOM_HPP
+
+#include <impl/Kokkos_Utilities.hpp>  // void_t
+#include <type_traits>
+
+// NOTE This header implements the detection idiom from Version 2 of the C++
+// Extensions for Library Fundamentals, ISO/IEC TS 19568:2017
+
+// I deliberately omitted detected_or, which does not fit well with the rest
+// of the specification. In my opinion, it should be removed from the TS.
+
+namespace Kokkos {
+
+namespace Impl {
+// base class for nonesuch to inherit from so it is not an aggregate
+struct nonesuch_base {};
+
+// primary template handles all types not supporting the archetypal Op
+template <class Default, class /*AlwaysVoid*/, template <class...> class Op,
+          class... /*Args*/>
+struct detector {
+  using value_t = std::false_type;
+  using type    = Default;
+};
+
+// specialization recognizes and handles only types supporting Op
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
+  using value_t = std::true_type;
+  using type    = Op<Args...>;
+};
+}  // namespace Impl
+
+struct nonesuch : private Impl::nonesuch_base {
+  ~nonesuch()               = delete;
+  nonesuch(nonesuch const&) = delete;
+  void operator=(nonesuch const&) = delete;
+};
+
+template <template <class...> class Op, class... Args>
+using is_detected =
+    typename Impl::detector<nonesuch, void, Op, Args...>::value_t;
+
+template <template <class...> class Op, class... Args>
+using detected_t = typename Impl::detector<nonesuch, void, Op, Args...>::type;
+
+template <class Default, template <class...> class Op, class... Args>
+using detected_or_t = typename Impl::detector<Default, void, Op, Args...>::type;
+
+template <class Expected, template <class...> class Op, class... Args>
+using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;
+
+template <class To, template <class...> class Op, class... Args>
+using is_detected_convertible =
+    std::is_convertible<detected_t<Op, Args...>, To>;
+
+#ifdef KOKKOS_ENABLE_CXX17
+template <template <class...> class Op, class... Args>
+inline constexpr bool is_detected_v = is_detected<Op, Args...>::value;
+
+template <class Expected, template <class...> class Op, class... Args>
+inline constexpr bool is_detected_exact_v =
+    is_detected_exact<Expected, Op, Args...>::value;
+
+template <class Expected, template <class...> class Op, class... Args>
+inline constexpr bool is_detected_convertible_v =
+    is_detected_convertible<Expected, Op, Args...>::value;
+#endif
+
+}  // namespace Kokkos
+
+#endif
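
The new header supplies the standard detection idiom (`is_detected`, `detected_t`, `detected_or_t`) for probing expressions at compile time. A self-contained illustration of how the machinery is used; the `detector` and `nonesuch` definitions below mirror the header so the snippet compiles without Kokkos:

```cpp
#include <type_traits>
#include <vector>

template <class...>
using void_t = void;

struct nonesuch {};

// Primary template: selected when Op<Args...> is ill-formed.
template <class Default, class /*AlwaysVoid*/, template <class...> class Op,
          class... /*Args*/>
struct detector {
  using value_t = std::false_type;
  using type    = Default;
};

// Specialization: selected when Op<Args...> is well-formed.
template <class Default, template <class...> class Op, class... Args>
struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
  using value_t = std::true_type;
  using type    = Op<Args...>;
};

template <template <class...> class Op, class... Args>
using is_detected = typename detector<nonesuch, void, Op, Args...>::value_t;

// Archetype: does T expose a member typedef value_type?
template <class T>
using value_type_t = typename T::value_type;

static_assert(is_detected<value_type_t, std::vector<int>>::value,
              "vector has value_type");
static_assert(!is_detected<value_type_t, int>::value, "int does not");

int main() { return 0; }
```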
diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 55aed13670e69838d94fff2735d421cc49a11835..c88c1ada14e38bd4c3cf90c61fc7351cc27fc8ea 100644
--- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -48,7 +48,6 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_AnalyzePolicy.hpp>
 #include <Kokkos_Concepts.hpp>
 #include <typeinfo>
diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
index d0366b599cf8c80c92812e386ced90f6fa77eb93..f6cdb2ec46cd4b5329a987e947ec356ff4efb0e9 100644
--- a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -287,7 +287,10 @@ struct DeepCopy<Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace,
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, "
+        "Kokkos::Experimental::HBWSpace,ExecutionSpace::DeepCopy: fence before "
+        "copy");
     memcpy(dst, src, n);
   }
 };
@@ -297,7 +300,9 @@ struct DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     memcpy(dst, src, n);
   }
 };
@@ -307,7 +312,9 @@ struct DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     memcpy(dst, src, n);
   }
 };
diff --git a/packages/kokkos/core/src/Kokkos_HIP.hpp b/packages/kokkos/core/src/Kokkos_HIP.hpp
index 33cf8321c80282d5346c66afb5ee9b4be589576b..09df4f2fed4d8c5499ec339391de0730474b1f80 100644
--- a/packages/kokkos/core/src/Kokkos_HIP.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP.hpp
@@ -54,7 +54,6 @@
 
 #include <Kokkos_HIP_Space.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <HIP/Kokkos_HIP_MDRangePolicy.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
index 17bd681aa4b7b7aa8d98bb8253c86db81de6ce05..d20d533645b2f6bfc721820cb7a43adf4434f8e8 100644
--- a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
@@ -58,6 +58,7 @@
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
+#include <HIP/Kokkos_HIP_Error.hpp>  // KOKKOS_IMPL_HIP_SAFE_CALL
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
@@ -67,6 +68,13 @@
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+namespace Impl {
+
+template <typename T>
+struct is_hip_type_space : public std::false_type {};
+
+}  // namespace Impl
+
 namespace Experimental {
 /** \brief  HIP on-device memory management */
 
@@ -116,10 +124,12 @@ class HIPSpace {
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return "HIP"; }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access HIPSpace */
   KOKKOS_DEPRECATED static void access_error();
   KOKKOS_DEPRECATED static void access_error(const void* const);
+#endif
 
  private:
   int m_device;  ///< Which HIP device
@@ -129,6 +139,11 @@ class HIPSpace {
 };
 
 }  // namespace Experimental
+
+template <>
+struct Impl::is_hip_type_space<Experimental::HIPSpace> : public std::true_type {
+};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -188,6 +203,11 @@ class HIPHostPinnedSpace {
   /*--------------------------------*/
 };
 }  // namespace Experimental
+
+template <>
+struct Impl::is_hip_type_space<Experimental::HIPHostPinnedSpace>
+    : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -268,174 +288,116 @@ struct MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace,
 namespace Kokkos {
 namespace Impl {
 
+void DeepCopyHIP(void* dst, const void* src, size_t n);
+void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst,
+                      const void* src, size_t n);
 void DeepCopyAsyncHIP(void* dst, const void* src, size_t n);
 
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-                ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>(
-        dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
   }
+};
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace1>::value &&
+                                 is_hip_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, ExecutionSpace> {
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace1, MemSpace2, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace1>::value &&
+        is_hip_type_space<MemSpace2>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace, HostSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
-};
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    HostSpace, MemSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 }  // namespace Impl
@@ -536,7 +498,7 @@ class HIP {
   using scratch_memory_space = ScratchMemorySpace<HIP>;
 
   HIP();
-  HIP(hipStream_t stream);
+  HIP(hipStream_t stream, bool manage_stream = false);
 
   //@}
   //------------------------------------
@@ -558,8 +520,10 @@ class HIP {
    * until all dispatched functors on this device have completed.
    */
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   hipStream_t hip_stream() const;
 
@@ -596,7 +560,7 @@ class HIP {
     return m_space_instance.get();
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Kokkos::Impl::HostSharedPtr<Impl::HIPInternal> m_space_instance;
@@ -620,9 +584,28 @@ class HIPSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Experimental::HIP, DT, DP...> {
+  ZeroMemset(const Kokkos::Experimental::HIP& exec_space,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type),
+        exec_space.hip_stream()));
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipMemset(dst.data(), 0,
+                  dst.size() * sizeof(typename View<DT, DP...>::value_type)));
+  }
+};
 }  // namespace Impl
 }  // namespace Kokkos
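
Besides mirroring the CUDA consolidation, the hunk above extends the `HIP(hipStream_t)` constructor with a `manage_stream` flag. A hedged usage sketch, assuming a Kokkos build with the HIP backend; with the default `manage_stream = false` the caller keeps ownership of the stream:

```cpp
#include <Kokkos_Core.hpp>
#include <hip/hip_runtime.h>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    hipStream_t stream;
    hipStreamCreate(&stream);
    {
      // Wrap the user-created stream; manage_stream defaults to false.
      Kokkos::Experimental::HIP hip_instance(stream);
      Kokkos::parallel_for(
          Kokkos::RangePolicy<Kokkos::Experimental::HIP>(hip_instance, 0, 100),
          KOKKOS_LAMBDA(int) {});
      // The labelled fence overload added above.
      hip_instance.fence("example: wait for work on the wrapped stream");
    }
    hipStreamDestroy(stream);  // still our job: manage_stream was false
  }
  Kokkos::finalize();
  return 0;
}
```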
 
diff --git a/packages/kokkos/core/src/Kokkos_HPX.hpp b/packages/kokkos/core/src/Kokkos_HPX.hpp
index 2100b49c116cfaecd35205aa60708ed1535578ca..236211864ee8f00f2bce884dc6a16666174bdadf 100644
--- a/packages/kokkos/core/src/Kokkos_HPX.hpp
+++ b/packages/kokkos/core/src/Kokkos_HPX.hpp
@@ -69,7 +69,6 @@
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_Tools.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
@@ -318,25 +317,50 @@ class HPX {
   }
 
   void impl_fence_instance() const {
-    if (hpx::threads::get_self_ptr() == nullptr) {
-      hpx::threads::run_as_hpx_thread([this]() { impl_get_future().wait(); });
-    } else {
-      impl_get_future().wait();
-    }
+    impl_fence_instance(
+        "Kokkos::Experimental::HPX::impl_fence_instance: Unnamed Instance "
+        "Fence");
+  }
+  void impl_fence_instance(const std::string &name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event(name, *this, [&]() {
+      if (hpx::threads::get_self_ptr() == nullptr) {
+        hpx::threads::run_as_hpx_thread([this]() { impl_get_future().wait(); });
+      } else {
+        impl_get_future().wait();
+      }
+    });
   }
 
   void impl_fence_all_instances() const {
-    hpx::util::yield_while(
-        []() { return m_active_parallel_region_count.load() != 0; });
+    impl_fence_all_instances(
+        "Kokkos::Experimental::HPX::impl_fence_all_instances: Unnamed Global "
+        "HPX Fence");
+  }
+  void impl_fence_all_instances(const std::string &name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event(name, *this, [&]() {
+      hpx::util::yield_while(
+          []() { return m_active_parallel_region_count.load() != 0; });
+    });
   }
 #endif
 
   void fence() const {
 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
     if (m_mode == instance_mode::global) {
-      impl_fence_all_instances();
+      impl_fence_all_instances(
+          "Kokkos::Experimental::HPX::fence: Unnamed Global HPX Fence");
+    } else {
+      impl_fence_instance(
+          "Kokkos::Experimental::HPX::fence: Unnamed HPX Instance Fence");
+    }
+#endif
+  }
+  void fence(const std::string &name) const {
+#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
+    if (m_mode == instance_mode::global) {
+      impl_fence_all_instances(name);
     } else {
-      impl_fence_instance();
+      impl_fence_instance(name);
     }
 #endif
   }
@@ -464,6 +488,7 @@ class HPXSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments &args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string &) final;
   void print_configuration(std::ostream &msg, const bool detail) final;
 };
 
@@ -491,7 +516,9 @@ inline void dispatch_execute_task(Closure *closure,
   }
 
   if (force_synchronous) {
-    instance.fence();
+    instance.fence(
+        "Kokkos::Experimental::Impl::HPX::dispatch_execute_task: fence due to "
+        "forced syncronizations");
   }
 }
 #else
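
The HPX changes follow the pattern this patch applies across all backends: every `fence()` gains an overload taking a label that is forwarded to Kokkos Tools. A hedged sketch of the caller-facing API, valid for any enabled backend:

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::DefaultExecutionSpace exec;
    Kokkos::View<double*> v("v", 1000);
    Kokkos::parallel_for(
        Kokkos::RangePolicy<>(exec, 0, v.extent(0)),
        KOKKOS_LAMBDA(int i) { v(i) = i; });
    // The label reaches profiling tools instead of the generic
    // "Unnamed Instance Fence" placeholder used by fence().
    exec.fence("example: wait for fill of v");
  }
  Kokkos::finalize();
  return 0;
}
```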
diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
index ba69fbad393ee391eff2b59c34d4ae526fa7af29..c96cf5fbbe1b3a07f75da83a2557d2bbe4cb38c0 100644
--- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -299,6 +299,20 @@ namespace Kokkos {
 
 namespace Impl {
 
+template <class DT, class... DP>
+struct ZeroMemset<typename HostSpace::execution_space, DT, DP...> {
+  ZeroMemset(const typename HostSpace::execution_space&,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type& value)
+      : ZeroMemset(dst, value) {}
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    using ValueType = typename View<DT, DP...>::value_type;
+    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
+  }
+};
+
 template <class ExecutionSpace>
 struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) {
@@ -306,9 +320,13 @@ struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> {
   }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     hostspace_parallel_deepcopy(dst, src, n);
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence after copy");
   }
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp
index 778b4f08109a5b2d617c2ff89298c9e92dbccb61..cfd77ea50fedcb5766ace9feb488c4c0f6238e89 100644
--- a/packages/kokkos/core/src/Kokkos_Layout.hpp
+++ b/packages/kokkos/core/src/Kokkos_Layout.hpp
@@ -50,7 +50,6 @@
 
 #include <cstddef>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 namespace Kokkos {
 
@@ -89,6 +88,16 @@ struct LayoutLeft {
                                 size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
                                 size_t N6 = 0, size_t N7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
+
+  friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutLeft& left, const LayoutLeft& right) {
+    return !(left == right);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -123,6 +132,16 @@ struct LayoutRight {
                                  size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
                                  size_t N6 = 0, size_t N7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
+
+  friend bool operator==(const LayoutRight& left, const LayoutRight& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutRight& left, const LayoutRight& right) {
+    return !(left == right);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -184,6 +203,18 @@ struct LayoutStride {
                                   size_t S7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3,
                                                           S4, S5, S6, S7} {}
+
+  friend bool operator==(const LayoutStride& left, const LayoutStride& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank] ||
+          left.stride[rank] != right.stride[rank])
+        return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutStride& left, const LayoutStride& right) {
+    return !(left == right);
+  }
 };
 
 // ===================================================================================
@@ -229,18 +260,6 @@ struct LayoutTiled {
   static_assert(IsPowerOfTwo,
                 "LayoutTiled must be given power-of-two tile dimensions");
 
-#if 0
-  static_assert( (Impl::is_integral_power_of_two(ArgN0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN1) ) &&
-                 (Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
-               , "LayoutTiled must be given power-of-two tile dimensions" );
-#endif
-
   using array_layout = LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3,
                                    ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo>;
   static constexpr Iterate outer_pattern = OuterP;
@@ -270,6 +289,16 @@ struct LayoutTiled {
                                  size_t argN4 = 0, size_t argN5 = 0,
                                  size_t argN6 = 0, size_t argN7 = 0)
       : dimension{argN0, argN1, argN2, argN3, argN4, argN5, argN6, argN7} {}
+
+  friend bool operator==(const LayoutTiled& left, const LayoutTiled& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutTiled& left, const LayoutTiled& right) {
+    return !(left == right);
+  }
 };
 
 }  // namespace Experimental
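
The new `operator==`/`operator!=` make layouts directly comparable, e.g. for checking whether two views were created with the same extents (and, for `LayoutStride`, the same strides). A small usage sketch; layouts are plain structs, so no `Kokkos::initialize` is needed here:

```cpp
#include <Kokkos_Core.hpp>
#include <cassert>

int main() {
  Kokkos::LayoutLeft a(10, 20);
  Kokkos::LayoutLeft b(10, 20);
  Kokkos::LayoutLeft c(10, 21);
  assert(a == b);  // all eight stored extents agree
  assert(a != c);  // the second extent differs
  // LayoutStride::operator== additionally compares the stride array, so
  // equal extents with different padding still compare unequal.
  return 0;
}
```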
diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp
index 0d0185346540bf929b4305d6ad496b2f02e39c69..8d0fd925a27070dcd97160ba25ba19f06d3842b2 100644
--- a/packages/kokkos/core/src/Kokkos_Macros.hpp
+++ b/packages/kokkos/core/src/Kokkos_Macros.hpp
@@ -53,11 +53,12 @@
  *  KOKKOS_ENABLE_HPX                 Kokkos::Experimental::HPX execution space
  *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP execution space
  *  KOKKOS_ENABLE_OPENMPTARGET        Kokkos::Experimental::OpenMPTarget
- * execution space KOKKOS_ENABLE_HWLOC               HWLOC library is available.
+ *                                    execution space
+ *  KOKKOS_ENABLE_HIP                 Kokkos::Experimental::HIP execution space
+ *  KOKKOS_ENABLE_SYCL                Kokkos::Experimental::SYCL execution space
+ *  KOKKOS_ENABLE_HWLOC               HWLOC library is available.
  *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK  Insert array bounds checks, is expensive!
- *  KOKKOS_ENABLE_MPI                 Negotiate MPI/execution space
- * interactions. KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory
- * space.
+ *  KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory space.
  */
 
 #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
@@ -211,6 +212,11 @@
 #define KOKKOS_ENABLE_PRAGMA_SIMD 1
 #endif
 
+// FIXME Workaround for an ICE (internal compiler error) with Intel 17, 18,
+// and 19 in Trilinos
+#if (KOKKOS_COMPILER_INTEL <= 1900)
+#define KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+#endif
+
 // FIXME_SYCL
 #if !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
@@ -220,11 +226,19 @@
 #define KOKKOS_MEMORY_ALIGNMENT 64
 #endif
 
+#if defined(_WIN32)
+#define KOKKOS_RESTRICT __restrict
+#else
 #define KOKKOS_RESTRICT __restrict__
+#endif
 
 #ifndef KOKKOS_IMPL_ALIGN_PTR
+#if defined(_WIN32)
+#define KOKKOS_IMPL_ALIGN_PTR(size) __declspec(align_value(size))
+#else
 #define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
 #endif
+#endif
 
 #if (1700 > KOKKOS_COMPILER_INTEL)
 #error "Compiling with Intel version earlier than 17.0 is not supported."
@@ -507,24 +521,44 @@
 #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
 #define KOKKOS_ENABLE_TASKDAG
 #endif
+// FIXME_SYCL Tasks not implemented
 #elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_TASKDAG
 #endif
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND
-#if (__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-#endif
-#endif
-
 #define KOKKOS_INVALID_INDEX (~std::size_t(0))
 
 #define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 #define KOKKOS_CONSTEXPR_14 constexpr
-#define KOKKOS_DEPRECATED [[deprecated]]
 #define KOKKOS_DEPRECATED_TRAILING_ATTRIBUTE
+#endif
+
+// Guard against Intel compiler versions <= 1900, which reject the attribute:
+// intel error #2651: attribute does not apply to any entity
+// using <deprecated_type> KOKKOS_DEPRECATED = ...
+#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && !defined(__NVCC__) && \
+    (!defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL > 1900)
+#define KOKKOS_DEPRECATED [[deprecated]]
+#define KOKKOS_DEPRECATED_WITH_COMMENT(comment) [[deprecated(comment)]]
+#else
+#define KOKKOS_DEPRECATED
+#define KOKKOS_DEPRECATED_WITH_COMMENT(comment)
+#endif
+
+#define KOKKOS_IMPL_STRINGIFY(x) #x
+#define KOKKOS_IMPL_TOSTRING(x) KOKKOS_IMPL_STRINGIFY(x)
+
+#ifdef _MSC_VER
+#define KOKKOS_IMPL_DO_PRAGMA(x) __pragma(x)
+#define KOKKOS_IMPL_WARNING(desc) \
+  KOKKOS_IMPL_DO_PRAGMA(message(  \
+      __FILE__ "(" KOKKOS_IMPL_TOSTRING(__LINE__) ") : warning: " #desc))
+#else
+#define KOKKOS_IMPL_DO_PRAGMA(x) _Pragma(#x)
+#define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc))
+#endif
 
 // DJS 05/28/2019: Bugfix: Issue 2155
 // Use KOKKOS_ENABLE_CUDA_LDG_INTRINSIC to avoid memory leak in RandomAccess
@@ -541,7 +575,7 @@
 
 #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) ||  \
      defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \
-    !defined(KOKKOS_COMPILER_MSVC)
+    !defined(_WIN32)
 #define KOKKOS_IMPL_ENABLE_STACKTRACE
 #define KOKKOS_IMPL_ENABLE_CXXABI
 #endif
@@ -553,7 +587,8 @@
 #undef __CUDA_ARCH__
 #endif
 
-#if defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG)
+#if (defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG)) || \
+    (defined(KOKKOS_COMPILER_INTEL) && defined(_WIN32))
 #define KOKKOS_THREAD_LOCAL __declspec(thread)
 #else
 #define KOKKOS_THREAD_LOCAL __thread
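
Among the Macros changes, `KOKKOS_IMPL_WARNING` adds a portable compile-time diagnostic: `_Pragma` on GCC/Clang-style compilers, `__pragma(message(...))` with a file/line prefix on MSVC. A standalone sketch of the same two-step stringification, with renamed macros so it does not clash with the real ones:

```cpp
#define MY_STRINGIFY(x) #x
#define MY_TOSTRING(x) MY_STRINGIFY(x)

#ifdef _MSC_VER
// MSVC: the message pragma carries a clickable "file(line)" prefix.
#define MY_DO_PRAGMA(x) __pragma(x)
#define MY_WARNING(desc) \
  MY_DO_PRAGMA(          \
      message(__FILE__ "(" MY_TOSTRING(__LINE__) ") : warning: " #desc))
#else
// Everyone else: stringify the whole pragma and hand it to _Pragma.
#define MY_DO_PRAGMA(x) _Pragma(#x)
#define MY_WARNING(desc) MY_DO_PRAGMA(message(#desc))
#endif

// Emits a diagnostic during compilation without failing the build.
MY_WARNING(deprecated configuration selected)

int main() { return 0; }
```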
diff --git a/packages/kokkos/core/src/Kokkos_MasterLock.hpp b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
index 3c45e131a0fba6e39f3f97ef2fd67451b9aef76c..cbfbb92660ba9d75f4aadb67196d969902524a4f 100644
--- a/packages/kokkos/core/src/Kokkos_MasterLock.hpp
+++ b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
@@ -47,6 +47,8 @@
 
 #include <Kokkos_Macros.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -72,4 +74,6 @@ class MasterLock;
 }  // namespace Experimental
 }  // namespace Kokkos
 
+#endif
+
 #endif  // KOKKOS_MASTER_LOCK_HPP
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
index 50223651e7d189e07cd94f9bf48eb6c5dcaa62d2..50fde82d77a7c37dfa0d5f3d1a565df470f680e0 100644
--- a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
+++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
@@ -55,116 +55,224 @@
 #endif
 
 namespace Kokkos {
+
+namespace Impl {
+template <class T, bool = std::is_integral<T>::value>
+struct promote {
+  using type = double;
+};
+template <class T>
+struct promote<T, false> {};
+template <>
+struct promote<long double> {
+  using type = long double;
+};
+template <>
+struct promote<double> {
+  using type = double;
+};
+template <>
+struct promote<float> {
+  using type = float;
+};
+template <class T>
+using promote_t = typename promote<T>::type;
+template <class T, class U>
+struct promote_2 {
+  using type = decltype(promote_t<T>() + promote_t<U>());
+};
+template <class T, class U>
+using promote_2_t = typename promote_2<T, U>::type;
+}  // namespace Impl
+
 namespace Experimental {
 
 #if defined(KOKKOS_ENABLE_SYCL)
-#define NAMESPACE_MATH_FUNCTIONS sycl
+#define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE sycl
 #else
-#define NAMESPACE_MATH_FUNCTIONS std
+#define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE std
 #endif
 
-#define KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, RETURNTYPE, ARGTYPE) \
-  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(ARGTYPE x) {                        \
-    using NAMESPACE_MATH_FUNCTIONS::FUNC;                                    \
-    return FUNC(x);                                                          \
+// NOTE long double overloads are not available on the device
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#else
+#define KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+#endif
+
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                                 \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x) {                                \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION long double FUNC(long double x) {                    \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x) {                             \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION long double FUNC##l(long double x) {                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  template <class T>                                                          \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, double> \
+  FUNC(T x) {                                                                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(static_cast<double>(x));                                      \
   }
 
-#define KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, RETURNTYPE)              \
-  template <typename Integer,                                              \
-            typename = std::enable_if_t<std::is_integral<Integer>::value>> \
-  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(Integer x) {                      \
-    return Kokkos::Experimental::FUNC(static_cast<double>(x));             \
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                              \
+  KOKKOS_INLINE_FUNCTION bool FUNC(float x) {                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(long double x) {                         \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  template <class T>                                                        \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, bool> \
+  FUNC(T x) {                                                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(static_cast<double>(x));                                    \
   }
 
-#define KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, TYPE) \
-  KOKKOS_INLINE_FUNCTION TYPE FUNC(TYPE x, TYPE y) {           \
-    using NAMESPACE_MATH_FUNCTIONS::FUNC;                      \
-    return FUNC(x, y);                                         \
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                               \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x, float y) {                      \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x, double y) {                   \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION long double FUNC(long double x, long double y) {    \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float y) {                   \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION long double FUNC##l(long double x, long double y) { \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  template <class T1, class T2>                                              \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_arithmetic<T1>::value &&   \
+                                              std::is_arithmetic<T2>::value, \
+                                          Kokkos::Impl::promote_2_t<T1, T2>> \
+  FUNC(T1 x, T2 y) {                                                         \
+    using Promoted = Kokkos::Impl::promote_2_t<T1, T2>;                      \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y));         \
   }
 
-// NOTE long double overloads are not available on the device
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
-    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#else  // long double overloads are not available
 
-#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
-  template <typename Arithmetic1, typename Arithmetic2,                      \
-            typename = std::enable_if_t<                                     \
-                std::is_arithmetic<Arithmetic1>::value &&                    \
-                std::is_arithmetic<Arithmetic2>::value &&                    \
-                !std::is_same<Arithmetic1, long double>::value &&            \
-                !std::is_same<Arithmetic2, long double>::value>>             \
-  KOKKOS_INLINE_FUNCTION double FUNC(Arithmetic1 x, Arithmetic2 y) {         \
-    return Kokkos::Experimental::FUNC(                                       \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
-                                       double, Arithmetic1>>(x),             \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
-                                       double, Arithmetic2>>(y));            \
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                                 \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x) {                                \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x) {                             \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  template <class T>                                                          \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, double> \
+  FUNC(T x) {                                                                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(static_cast<double>(x));                                      \
   }
 
-#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                     \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)   \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
-
-#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                  \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)  \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
-
-#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)             \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)  \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double) \
-  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
-
-#define KOKKOS_IMPL_MATH_NAN()                                        \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*)
-
-#else  // long double overloads are available
-
-#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
-  template <typename Arithmetic1, typename Arithmetic2,                      \
-            typename =                                                       \
-                std::enable_if_t<std::is_arithmetic<Arithmetic1>::value &&   \
-                                 std::is_arithmetic<Arithmetic2>::value>,    \
-            typename Promoted = std::conditional_t<                          \
-                std::is_same<Arithmetic1, long double>::value ||             \
-                    std::is_same<Arithmetic2, long double>::value,           \
-                long double, double>>                                        \
-  KOKKOS_INLINE_FUNCTION Promoted FUNC(Arithmetic1 x, Arithmetic2 y) {       \
-    return Kokkos::Experimental::FUNC(                                       \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
-                                       double, Arithmetic1>>(x),             \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
-                                       double, Arithmetic2>>(y));            \
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                              \
+  KOKKOS_INLINE_FUNCTION bool FUNC(float x) {                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  template <class T>                                                        \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, bool> \
+  FUNC(T x) {                                                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(static_cast<double>(x));                                    \
   }
 
-#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                               \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)             \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double)           \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, long double, long double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
-
-#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                       \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)       \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double)      \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, long double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
-
-#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                  \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)       \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double)      \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, long double) \
-  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
-
-#define KOKKOS_IMPL_MATH_NAN()                                        \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanl, long double, char const*)
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                          \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x, float y) {                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x, double y) {              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float y) {              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  template <class T1, class T2>                                         \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<                              \
+      std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value && \
+          !std::is_same<T1, long double>::value &&                      \
+          !std::is_same<T2, long double>::value,                        \
+      Kokkos::Impl::promote_2_t<T1, T2>>                                \
+  FUNC(T1 x, T2 y) {                                                    \
+    using Promoted = Kokkos::Impl::promote_2_t<T1, T2>;                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y));    \
+  }
 
 #endif
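+
+// Illustrative expansion (editorial sketch, not code emitted here): with the
+// macros above, KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow) defines
+// pow(float, float), pow(double, double), powf, and, when long double
+// overloads are available, pow(long double, long double) and powl, plus a
+// template overload that promotes mixed arithmetic arguments through
+// Kokkos::Impl::promote_2_t.  Assuming promote_2_t follows the usual <cmath>
+// promotion rules, a call such as
+//
+//   auto r = Kokkos::Experimental::pow(2, 3.5f);
+//
+// converts both arguments to double and dispatches to pow(double, double).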
 
 // Basic operations
+KOKKOS_INLINE_FUNCTION int abs(int n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION long abs(long n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION long long abs(long long n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION float abs(float x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+KOKKOS_INLINE_FUNCTION double abs(double x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+KOKKOS_INLINE_FUNCTION long double abs(long double x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+#endif
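+// The overload set above mirrors std::abs: abs(-2) selects the int overload
+// and abs(-2.5) the double one, while the long double overload is only
+// defined when the backend provides long double math (see the guard above).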
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder)
@@ -172,7 +280,18 @@ KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmax)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fdim)
 #ifndef KOKKOS_ENABLE_SYCL
-KOKKOS_IMPL_MATH_NAN()
+KOKKOS_INLINE_FUNCTION float nanf(char const* arg) { return ::nanf(arg); }
+KOKKOS_INLINE_FUNCTION double nan(char const* arg) { return ::nan(arg); }
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+KOKKOS_INLINE_FUNCTION long double nanl(char const* arg) { return ::nanl(arg); }
+#endif
+#else
+// FIXME_SYCL
+// sycl::nan does not follow the C/C++ standard library and takes an unsigned
+// integer as argument.  The current implementation does not attempt to convert
+// the character string arg into the quiet NaN value.
+KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); }
+KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); }
 #endif
 // Power functions
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow)
@@ -211,6 +330,7 @@ KOKKOS_IMPL_MATH_UNARY_FUNCTION(lgamma)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(ceil)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(floor)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(trunc)
+// FIXME_SYCL not available as of current SYCL specification v1.2.1
 #ifndef KOKKOS_ENABLE_SYCL
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(nearbyint)
 #endif
@@ -219,14 +339,12 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(isfinite)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isinf)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnan)
 
-#undef KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT
-#undef KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL
-#undef KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT
-#undef KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC
+#undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE
+#undef KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
 #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION
 #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE
 #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION
-#undef KOKKOS_IMPL_MATH_NAN
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7bcea91c86790dd52265addb1ca651adbe21a966
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
@@ -0,0 +1,1280 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
+#define KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cmath>
+#include <algorithm>
+#include <type_traits>
+#include <Kokkos_MathematicalFunctions.hpp>
+#include <Kokkos_NumericTraits.hpp>
+#include <Kokkos_Complex.hpp>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace Kokkos {
+namespace Experimental {
+
+//! Compute exponential integral E1(x) (x > 0).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION RealType expint1(RealType x) {
+  // This function is a conversion of the corresponding Fortran program in
+  // S. Zhang & J. Jin "Computation of Special Functions" (Wiley, 1996).
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::log;
+  using Kokkos::Experimental::pow;
+
+  RealType e1;
+
+  if (x < 0) {
+    e1 = -infinity<RealType>::value;
+  } else if (x == 0.0) {
+    e1 = infinity<RealType>::value;
+  } else if (x <= 1.0) {
+    e1         = 1.0;
+    RealType r = 1.0;
+    for (int k = 1; k <= 25; k++) {
+      RealType k_real = static_cast<RealType>(k);
+      r               = -r * k_real * x / pow(k_real + 1.0, 2.0);
+      e1              = e1 + r;
+      if (fabs(r) <= fabs(e1) * epsilon<RealType>::value) break;
+    }
+    e1 = -0.5772156649015328 - log(x) + x * e1;
+  } else {
+    int m       = 20 + static_cast<int>(80.0 / x);
+    RealType t0 = 0.0;
+    for (int k = m; k >= 1; k--) {
+      RealType k_real = static_cast<RealType>(k);
+      t0              = k_real / (1.0 + k_real / (x + t0));
+    }
+    e1 = exp(-x) * (1.0 / (x + t0));
+  }
+  return e1;
+}
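+
+// Usage sketch (illustrative; x and y stand for hypothetical Views): expint1
+// is marked KOKKOS_INLINE_FUNCTION and is therefore callable in device code,
+// e.g.
+//
+//   Kokkos::parallel_for(
+//       "expint1", n, KOKKOS_LAMBDA(int i) { y(i) = expint1(x(i)); });
+//
+// For reference, expint1(1.0) should be roughly 0.219384.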
+
+//! Compute error function erf(z) for z=cmplx(x,y).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erf(
+    const Kokkos::complex<RealType>& z) {
+  // This function is a conversion of the corresponding Fortran program written
+  // by D.E. Amos, May, 1974. D.E. Amos' revisions of Jan 86 incorporated by
+  // Ken Damrau on 27-Jan-1986 14:37:13
+  //
+  // Reference: NBS HANDBOOK OF MATHEMATICAL FUNCTIONS, AMS 55, By
+  //           M. ABRAMOWITZ AND I.A. STEGUN, December, 1955.
+  // Summary:
+  //  If x < 0, z is replaced by -z and all computation is done in the right
+  //  half plane, except for z inside the circle abs(z)<=2, since
+  //  erf(-z)=-erf(z). The regions for computation are divided as follows
+  //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
+  //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
+  //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymptotic expansion
+  //  Error condition: abs(z^2) > 670 is a fatal overflow error
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::sin;
+
+  using CmplxType = Kokkos::complex<RealType>;
+
+  auto const inf = infinity<RealType>::value;
+  auto const tol = epsilon<RealType>::value;
+
+  const RealType fnorm = 1.12837916709551;
+  const RealType gnorm = 0.564189583547756;
+  const RealType eh    = 0.606530659712633;
+  const RealType ef    = 0.778800783071405;
+  // const RealType tol   = 1.0e-13;
+  const RealType pi = M_PI;
+
+  CmplxType cans;
+
+  RealType az = Kokkos::abs(z);
+  if (az <= 2.0) {  // Series for abs(z)<=2.0
+    CmplxType cz    = z * z;
+    CmplxType accum = CmplxType(1.0, 0.0);
+    CmplxType term  = accum;
+    RealType ak     = 1.5;
+    for (int i = 1; i <= 35; i++) {
+      term  = term * cz / ak;
+      accum = accum + term;
+      if (Kokkos::abs(term) <= tol) break;
+      ak = ak + 1.0;
+    }
+    cz          = -cz;
+    RealType er = cz.real();
+    RealType ei = cz.imag();
+    accum       = accum * z * fnorm;
+    cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+    cans        = accum * cz;
+  }       // end (az <= 2.0)
+  else {  //(az > 2.0)
+    CmplxType zp = z;
+    if (z.real() < 0.0) zp = -z;
+    CmplxType cz = zp * zp;
+    RealType xp  = zp.real();
+    RealType yp  = zp.imag();
+    if (xp > 1.0) {
+      // continued fraction for erfc(z), abs(Z)>2
+      int n          = static_cast<int>(100.0 / az + 5.0);
+      int fn         = n;
+      CmplxType term = cz;
+      for (int i = 1; i <= n; i++) {
+        RealType fnh = fn - 0.5;
+        term         = cz + (fnh * term) / (fn + term);
+        fn           = fn - 1;
+      }
+      if (Kokkos::abs(cz) > 670.0) return CmplxType(inf, inf);
+      cz              = -cz;
+      RealType er     = cz.real();
+      RealType ei     = cz.imag();
+      cz              = exp(er) * CmplxType(cos(ei), sin(ei));
+      CmplxType accum = zp * gnorm * cz;
+      cans            = 1.0 - accum / term;
+      if (z.real() < 0.0) cans = -cans;
+    }       // end (xp > 1.0)
+    else {  //(xp <= 1.0)
+      if (fabs(yp) <
+          6.0) {  // Series (3) for abs(z)>2 and 0<=xp<=1 and abs(yp)<6
+        RealType s1   = 0.0;
+        RealType s2   = 0.0;
+        RealType x2   = xp * xp;
+        RealType fx2  = 4.0 * x2;
+        RealType tx   = xp + xp;
+        RealType xy   = xp * yp;
+        RealType sxyh = sin(xy);
+        RealType sxy  = sin(xy + xy);
+        RealType cxy  = cos(xy + xy);
+        RealType fn   = 1.0;
+        RealType fnh  = 0.5;
+        RealType ey   = exp(yp);
+        RealType en   = ey;
+        RealType ehn  = eh;
+        RealType un   = ef;
+        RealType vn   = 1.0;
+        for (int i = 1; i <= 50; i++) {
+          RealType ren = 1.0 / en;
+          RealType csh = en + ren;
+          RealType tm  = xp * csh;
+          RealType ssh = en - ren;
+          RealType tmp = fnh * ssh;
+          RealType rn  = tx - tm * cxy + tmp * sxy;
+          RealType ain = tm * sxy + tmp * cxy;
+          RealType cf  = un / (vn + fx2);
+          rn           = cf * rn;
+          ain          = cf * ain;
+          s1           = s1 + rn;
+          s2           = s2 + ain;
+          if ((fabs(rn) + fabs(ain)) < tol * (fabs(s1) + fabs(s2))) break;
+          un  = un * ehn * ef;
+          ehn = ehn * eh;
+          en  = en * ey;
+          vn  = vn + fn + fn + 1.0;
+          fnh = fnh + 0.5;
+          fn  = fn + 1.0;
+        }
+        s1 = s1 + s1;
+        s2 = s2 + s2;
+        if (z.real() == 0.0)
+          s2 = s2 + yp;
+        else {
+          s1 = s1 + sxyh * sxyh / xp;
+          s2 = s2 + sxy / tx;
+        }
+        // Power series for erf(xp), 0<=xp<=1
+        RealType w  = 1.0;
+        RealType ak = 1.5;
+        RealType tm = 1.0;
+        for (int i = 1; i <= 17; i++) {
+          tm = tm * x2 / ak;
+          w  = w + tm;
+          if (tm <= tol) break;
+          ak = ak + 1.0;
+        }
+        RealType ex = exp(-x2);
+        w           = w * xp * fnorm * ex;
+        RealType cf = ex / pi;
+        s1          = cf * s1 + w;
+        s2          = cf * s2;
+        cans        = CmplxType(s1, s2);
+        if (z.real() < 0.0) cans = -cans;
+      }       // end (abs(yp) < 6.0)
+      else {  //(abs(YP)>=6.0)
+        // Asymptotic expansion for 0<=xp<=1 and abs(yp)>=6
+        CmplxType rcz   = 0.5 / cz;
+        CmplxType accum = CmplxType(1.0, 0.0);
+        CmplxType term  = accum;
+        RealType ak     = 1.0;
+        for (int i = 1; i <= 35; i++) {
+          term  = -term * ak * rcz;
+          accum = accum + term;
+          if (Kokkos::abs(term) / Kokkos::abs(accum) <= tol) break;
+          ak = ak + 2.0;
+        }
+        accum       = accum * gnorm / zp;
+        cz          = -cz;
+        RealType er = cz.real();
+        if (fabs(er) > 670.0) return CmplxType(inf, inf);
+        RealType ei = cz.imag();
+        cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+        cans        = 1.0 - accum * cz;
+        if (z.real() < 0.0) cans = -cans;
+      }  // end (abs(YP)>=6.0)
+    }    // end (xp <= 1.0)
+  }      // end (az > 2.0)
+  return cans;
+}
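+
+// Example (illustrative): a purely real argument reduces to the real error
+// function, e.g. erf(Kokkos::complex<double>(1.0, 0.0)) should return
+// approximately (0.842701, 0.0).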
+
+//! Compute scaled complementary error function erfcx(z)=exp(z^2)*erfc(z)
+//! for z=cmplx(x,y).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erfcx(
+    const Kokkos::complex<RealType>& z) {
+  // This function is a conversion of the corresponding Fortran program written
+  // by D.E. Amos, May, 1974. D.E. Amos' revisions of Jan 86 incorporated by
+  // Ken Damrau on 27-Jan-1986 14:37:13
+  //
+  // Reference: NBS HANDBOOK OF MATHEMATICAL FUNCTIONS, AMS 55, By
+  //           M. ABRAMOWITZ AND I.A. STEGUN, December, 1955.
+  // Summary:
+  //  If x < 0, z is replaced by -z and all computation is done in the right
+  //  half plane, except for z inside the circle abs(z)<=2, since
+  //  erfc(-z)=2-erfc(z). The regions for computation are divided as follows
+  //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
+  //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
+  //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymptotic expansion
+  // Error condition: abs(z^2) > 670 is a fatal overflow error when x<0
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::isinf;
+  using Kokkos::Experimental::sin;
+
+  using CmplxType = Kokkos::complex<RealType>;
+
+  auto const inf = infinity<RealType>::value;
+  auto const tol = epsilon<RealType>::value;
+
+  const RealType fnorm = 1.12837916709551;
+  const RealType gnorm = 0.564189583547756;
+  const RealType eh    = 0.606530659712633;
+  const RealType ef    = 0.778800783071405;
+  // const RealType tol   = 1.0e-13;
+  const RealType pi = M_PI;
+
+  CmplxType cans;
+
+  if ((isinf(z.real())) && (z.real() > 0)) {
+    cans = CmplxType(0.0, 0.0);
+    return cans;
+  }
+  if ((isinf(z.real())) && (z.real() < 0)) {
+    cans = CmplxType(inf, inf);
+    return cans;
+  }
+
+  RealType az = Kokkos::abs(z);
+  if (az <= 2.0) {  // Series for abs(z)<=2.0
+    CmplxType cz    = z * z;
+    CmplxType accum = CmplxType(1.0, 0.0);
+    CmplxType term  = accum;
+    RealType ak     = 1.5;
+    for (int i = 1; i <= 35; i++) {
+      term  = term * cz / ak;
+      accum = accum + term;
+      if (Kokkos::abs(term) <= tol) break;
+      ak = ak + 1.0;
+    }
+    cz          = -cz;
+    RealType er = cz.real();
+    RealType ei = cz.imag();
+    accum       = accum * z * fnorm;
+    cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+    cans        = 1.0 / cz - accum;
+  }       // end (az <= 2.0)
+  else {  //(az > 2.0)
+    CmplxType zp = z;
+    if (z.real() < 0.0) zp = -z;
+    CmplxType cz = zp * zp;
+    RealType xp  = zp.real();
+    RealType yp  = zp.imag();
+    if (xp > 1.0) {
+      // continued fraction for erfc(z), abs(z)>2
+      int n          = static_cast<int>(100.0 / az + 5.0);
+      int fn         = n;
+      CmplxType term = cz;
+      for (int i = 1; i <= n; i++) {
+        RealType fnh = fn - 0.5;
+        term         = cz + (fnh * term) / (fn + term);
+        fn           = fn - 1;
+      }
+      cans = zp * gnorm / term;
+      if (z.real() >= 0.0) return cans;
+      if (Kokkos::abs(cz) > 670.0) return CmplxType(inf, inf);
+      cz          = -cz;
+      RealType er = cz.real();
+      RealType ei = cz.imag();
+      cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+      cz          = 1.0 / cz;
+      cans        = cz + cz - cans;
+    }       // end (xp > 1.0)
+    else {  //(xp <= 1.0)
+      if (fabs(yp) <
+          6.0) {  // Series (3) for abs(z)>2 and 0<=xp<=1 and abs(yp)<6
+        RealType s1   = 0.0;
+        RealType s2   = 0.0;
+        RealType x2   = xp * xp;
+        RealType fx2  = 4.0 * x2;
+        RealType tx   = xp + xp;
+        RealType xy   = xp * yp;
+        RealType sxyh = sin(xy);
+        RealType sxy  = sin(xy + xy);
+        RealType cxy  = cos(xy + xy);
+        RealType fn   = 1.0;
+        RealType fnh  = 0.5;
+        RealType ey   = exp(yp);
+        RealType en   = ey;
+        RealType ehn  = eh;
+        RealType un   = ef;
+        RealType vn   = 1.0;
+        for (int i = 1; i <= 50; i++) {
+          RealType ren = 1.0 / en;
+          RealType csh = en + ren;
+          RealType tm  = xp * csh;
+          RealType ssh = en - ren;
+          RealType tmp = fnh * ssh;
+          RealType rn  = tx - tm * cxy + tmp * sxy;
+          RealType ain = tm * sxy + tmp * cxy;
+          RealType cf  = un / (vn + fx2);
+          rn           = cf * rn;
+          ain          = cf * ain;
+          s1           = s1 + rn;
+          s2           = s2 + ain;
+          if ((fabs(rn) + fabs(ain)) < tol * (fabs(s1) + fabs(s2))) break;
+          un  = un * ehn * ef;
+          ehn = ehn * eh;
+          en  = en * ey;
+          vn  = vn + fn + fn + 1.0;
+          fnh = fnh + 0.5;
+          fn  = fn + 1.0;
+        }
+        s1 = s1 + s1;
+        s2 = s2 + s2;
+        if (z.real() == 0.0)
+          s2 = s2 + yp;
+        else {
+          s1 = s1 + sxyh * sxyh / xp;
+          s2 = s2 + sxy / tx;
+        }
+        // Power series for erf(xp), 0<=xp<=1
+        RealType w  = 1.0;
+        RealType ak = 1.5;
+        RealType tm = 1.0;
+        for (int i = 1; i <= 17; i++) {
+          tm = tm * x2 / ak;
+          w  = w + tm;
+          if (tm <= tol) break;
+          ak = ak + 1.0;
+        }
+        RealType ex   = exp(-x2);
+        w             = w * xp * fnorm * ex;
+        CmplxType rcz = CmplxType(cxy, sxy);
+        RealType y2   = yp * yp;
+        cz            = exp(x2 - y2) * rcz;
+        rcz           = exp(-y2) * rcz;
+        if (z.real() >= 0.0)
+          cans = cz * (1.0 - w) - rcz * CmplxType(s1, s2) / pi;
+        else
+          cans = cz * (1.0 + w) + rcz * CmplxType(s1, s2) / pi;
+      }       // end (abs(yp) < 6.0)
+      else {  //(abs(YP)>=6.0)
+        // Asymptotic expansion for 0<=xp<=1 and abs(yp)>=6
+        CmplxType rcz   = 0.5 / cz;
+        CmplxType accum = CmplxType(1.0, 0.0);
+        CmplxType term  = accum;
+        RealType ak     = 1.0;
+        for (int i = 1; i <= 35; i++) {
+          term  = -term * ak * rcz;
+          accum = accum + term;
+          if (Kokkos::abs(term) / Kokkos::abs(accum) <= tol) break;
+          ak = ak + 2.0;
+        }
+        accum = accum * gnorm / zp;
+        if (z.real() < 0.0) accum = -accum;
+        cans = accum;
+      }  // end (abs(YP)>=6.0)
+    }    // end (xp <= 1.0)
+  }      // end (az > 2.0)
+  return cans;
+}
+
+//! Compute scaled complementary error function erfcx(x)=exp(x^2)*erfc(x)
+//! for real x
+template <class RealType>
+KOKKOS_INLINE_FUNCTION RealType erfcx(RealType x) {
+  using CmplxType = Kokkos::complex<RealType>;
+  // Note: using erfcx(complex) for now
+  // TODO: replace with an implementation of erfcx(real)
+  CmplxType zin  = CmplxType(x, 0.0);
+  CmplxType zout = erfcx(zin);
+  return zout.real();
+}
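+
+// Note (sketch): the scaled form avoids the underflow of erfc for large
+// positive x, since erfc(x) ~ exp(-x*x)/(x*sqrt(pi)) decays to zero while
+// erfcx(x) ~ 1/(x*sqrt(pi)) stays representable; e.g. erfc(30.0) underflows
+// to 0 in double precision, whereas erfcx(30.0) is approximately 0.0188.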
+
+//! Compute Bessel function J0(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbj0      --- J0(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::pow;
+
+  CmplxType cbj0;
+  const RealType pi    = M_PI;
+  const RealType a[12] = {
+      -0.703125e-01,           0.112152099609375e+00,   -0.5725014209747314e+00,
+      0.6074042001273483e+01,  -0.1100171402692467e+03, 0.3038090510922384e+04,
+      -0.1188384262567832e+06, 0.6252951493434797e+07,  -0.4259392165047669e+09,
+      0.3646840080706556e+11,  -0.3833534661393944e+13, 0.4854014686852901e+15};
+  const RealType b[12] = {0.732421875e-01,        -0.2271080017089844e+00,
+                          0.1727727502584457e+01, -0.2438052969955606e+02,
+                          0.5513358961220206e+03, -0.1825775547429318e+05,
+                          0.8328593040162893e+06, -0.5006958953198893e+08,
+                          0.3836255180230433e+10, -0.3649010818849833e+12,
+                          0.4218971570284096e+14, -0.5827244631566907e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbj0 = CmplxType(1.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct1 = z1 - 0.25 * pi;
+      CmplxType cp0 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.9)
+        cp0 = cp0 + a[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq0 = -0.125 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.10)
+        cq0 = cq0 + b[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj0         = cu * (cp0 * Kokkos::cos(ct1) - cq0 * Kokkos::sin(ct1));
+    }
+  }
+  return cbj0;
+}
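+
+// Calling convention (sketch): RealType and IntType are not deduced from the
+// defaulted function arguments, so callers must provide them explicitly, as
+// the Hankel functions below do, e.g.
+//
+//   auto j0 = cyl_bessel_j0<Kokkos::complex<double>, double, int>(z);
+//
+// For reference, J0 at z = (2, 0) should be approximately (0.223891, 0).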
+
+//! Compute Bessel function Y0(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cby0      --- Y0(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cby0, cbj0;
+  const RealType pi    = M_PI;
+  const RealType el    = 0.57721566490153286060651209008240;
+  const RealType a[12] = {
+      -0.703125e-01,           0.112152099609375e+00,   -0.5725014209747314e+00,
+      0.6074042001273483e+01,  -0.1100171402692467e+03, 0.3038090510922384e+04,
+      -0.1188384262567832e+06, 0.6252951493434797e+07,  -0.4259392165047669e+09,
+      0.3646840080706556e+11,  -0.3833534661393944e+13, 0.4854014686852901e+15};
+  const RealType b[12] = {0.732421875e-01,        -0.2271080017089844e+00,
+                          0.1727727502584457e+01, -0.2438052969955606e+02,
+                          0.5513358961220206e+03, -0.1825775547429318e+05,
+                          0.8328593040162893e+06, -0.5006958953198893e+08,
+                          0.3836255180230433e+10, -0.3649010818849833e+12,
+                          0.4218971570284096e+14, -0.5827244631566907e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cby0 = -CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0, ce;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+      ce   = Kokkos::log(z / 2.0) + el;
+      cby0 = r2p * (ce * cbj0 - 4.0 * csu / cs0);
+    } else {  // Using asymptotic expansion (5.2.6) for |z|>joint_val
+              // (default:25)
+      CmplxType ct1 = z1 - 0.25 * pi;
+      CmplxType cp0 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.9)
+        cp0 = cp0 + a[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq0 = -0.125 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.10)
+        cq0 = cq0 + b[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj0         = cu * (cp0 * Kokkos::cos(ct1) - cq0 * Kokkos::sin(ct1));
+      cby0         = cu * (cp0 * Kokkos::sin(ct1) + cq0 * Kokkos::cos(ct1));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        if (z.imag() < 0.0) cby0 = cby0 - 2.0 * ci * cbj0;
+        if (z.imag() >= 0.0) cby0 = cby0 + 2.0 * ci * cbj0;
+      }
+    }
+  }
+  return cby0;
+}
+
+//! Compute Bessel function J1(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbj1      --- J1(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::pow;
+
+  CmplxType cbj1;
+  const RealType pi     = M_PI;
+  const RealType a1[12] = {0.1171875e+00,          -0.144195556640625e+00,
+                           0.6765925884246826e+00, -0.6883914268109947e+01,
+                           0.1215978918765359e+03, -0.3302272294480852e+04,
+                           0.1276412726461746e+06, -0.6656367718817688e+07,
+                           0.4502786003050393e+09, -0.3833857520742790e+11,
+                           0.4011838599133198e+13, -0.5060568503314727e+15};
+  const RealType b1[12] = {
+      -0.1025390625e+00,       0.2775764465332031e+00,  -0.1993531733751297e+01,
+      0.2724882731126854e+02,  -0.6038440767050702e+03, 0.1971837591223663e+05,
+      -0.8902978767070678e+06, 0.5310411010968522e+08,  -0.4043620325107754e+10,
+      0.3827011346598605e+12,  -0.4406481417852278e+14, 0.6065091351222699e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbj1 = CmplxType(0.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 1) cbj1 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj1 = cbj1 / cs0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct2 = z1 - 0.75 * pi;
+      CmplxType cp1 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.11)
+        cp1 = cp1 + a1[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq1 = 0.375 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.12)
+        cq1 = cq1 + b1[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj1         = cu * (cp1 * Kokkos::cos(ct2) - cq1 * Kokkos::sin(ct2));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        cbj1 = -cbj1;
+      }
+    }
+  }
+  return cbj1;
+}
+
+//! Compute Bessel function Y1(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cby1      --- Y1(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cby1, cbj0, cbj1, cby0;
+  const RealType pi     = M_PI;
+  const RealType el     = 0.57721566490153286060651209008240;
+  const RealType a1[12] = {0.1171875e+00,          -0.144195556640625e+00,
+                           0.6765925884246826e+00, -0.6883914268109947e+01,
+                           0.1215978918765359e+03, -0.3302272294480852e+04,
+                           0.1276412726461746e+06, -0.6656367718817688e+07,
+                           0.4502786003050393e+09, -0.3833857520742790e+11,
+                           0.4011838599133198e+13, -0.5060568503314727e+15};
+  const RealType b1[12] = {
+      -0.1025390625e+00,       0.2775764465332031e+00,  -0.1993531733751297e+01,
+      0.2724882731126854e+02,  -0.6038440767050702e+03, 0.1971837591223663e+05,
+      -0.8902978767070678e+06, 0.5310411010968522e+08,  -0.4043620325107754e+10,
+      0.3827011346598605e+12,  -0.4406481417852278e+14, 0.6065091351222699e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cby1 = -CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0, ce;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 1) cbj1 = cf;
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+      ce   = Kokkos::log(z / 2.0) + el;
+      cby0 = r2p * (ce * cbj0 - 4.0 * csu / cs0);
+      cbj1 = cbj1 / cs0;
+      cby1 = (cbj1 * cby0 - 2.0 / (pi * z)) / cbj0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct2 = z1 - 0.75 * pi;
+      CmplxType cp1 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.11)
+        cp1 = cp1 + a1[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq1 = 0.375 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.12)
+        cq1 = cq1 + b1[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj1         = cu * (cp1 * Kokkos::cos(ct2) - cq1 * Kokkos::sin(ct2));
+      cby1         = cu * (cp1 * Kokkos::sin(ct2) + cq1 * Kokkos::cos(ct2));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        if (z.imag() < 0.0) cby1 = -(cby1 - 2.0 * ci * cbj1);
+        if (z.imag() >= 0.0) cby1 = -(cby1 + 2.0 * ci * cbj1);
+      }
+    }
+  }
+  return cby1;
+}
+
+//! Compute modified Bessel function I0(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbi0      --- I0(z)
+  CmplxType cbi0;
+  const RealType pi    = M_PI;
+  const RealType a[12] = {0.125,
+                          7.03125e-2,
+                          7.32421875e-2,
+                          1.1215209960938e-1,
+                          2.2710800170898e-1,
+                          5.7250142097473e-1,
+                          1.7277275025845e0,
+                          6.0740420012735e0,
+                          2.4380529699556e1,
+                          1.1001714026925e2,
+                          5.5133589612202e2,
+                          3.0380905109224e3};
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbi0 = CmplxType(1.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      // CmplxType csk0 = CmplxType(0.0,0.0);
+      CmplxType cf0 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 0) cbi0 = cf;
+        // if ((k == 2*(k/2)) && (k != 0)) {
+        //  csk0 = csk0+4.0*cf/static_cast<RealType>(k);
+        //}
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+    } else {  // Using asymptotic expansion (6.2.1) for |z|>joint_val
+              // (default:25)
+      CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1);
+      cbi0         = CmplxType(1.0, 0.0);
+      CmplxType zr = 1.0 / z1;
+      for (int k = 1; k <= 12; k++) {
+        cbi0 = cbi0 + a[k - 1] * Kokkos::pow(zr, 1.0 * k);
+      }
+      cbi0 = ca * cbi0;
+    }
+  }
+  return cbi0;
+}
+
+//! Compute modified Bessel function K0(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k0(const CmplxType& z,
+                                               const RealType& joint_val = 9,
+                                               const IntType& bw_start   = 30) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Purpose: Compute modified Bessel function K0(z) of the second kind of
+  //             order zero for a complex argument
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbk0      --- K0(z)
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cbk0, cbi0;
+  const RealType pi = M_PI;
+  const RealType el = 0.57721566490153286060651209008240;
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbk0 = CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:9)
+      CmplxType cbs  = CmplxType(0.0, 0.0);
+      CmplxType csk0 = CmplxType(0.0, 0.0);
+      CmplxType cf0  = CmplxType(0.0, 0.0);
+      CmplxType cf1  = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 30)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 0) cbi0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          csk0 = csk0 + 4.0 * cf / static_cast<RealType>(k);
+        }
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+      cbk0 = -(Kokkos::log(0.5 * z1) + el) * cbi0 + cs0 * csk0;
+    } else {  // Using asymptotic expansion (6.2.2) for |z|>joint_val
+              // (default:9)
+      CmplxType ca0  = Kokkos::sqrt(pi / (2.0 * z1)) * Kokkos::exp(-z1);
+      CmplxType cbkl = CmplxType(1.0, 0.0);
+      CmplxType cr   = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 30; k++) {
+        cr   = 0.125 * cr * (0.0 - pow(2.0 * k - 1.0, 2.0)) / (k * z1);
+        cbkl = cbkl + cr;
+      }
+      cbk0 = ca0 * cbkl;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      if (z.imag() < 0.0)
+        cbk0 = cbk0 + ci * pi * cyl_bessel_i0<CmplxType, RealType, IntType>(z);
+      if (z.imag() >= 0.0)
+        cbk0 = cbk0 - ci * pi * cyl_bessel_i0<CmplxType, RealType, IntType>(z);
+    }
+  }
+  return cbk0;
+}
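+
+// Note (sketch): for z.real() < 0 the value computed at -z is corrected via
+// relation (6.4.4) of Zhang & Jin, adding or subtracting i*pi*I0(z)
+// depending on the sign of z.imag(), which is why cyl_bessel_i0 is
+// re-evaluated at the original argument z.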
+
+//! Compute modified Bessel function I1(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbi1      --- I1(z)
+  CmplxType cbi1;
+  const RealType pi    = M_PI;
+  const RealType b[12] = {-0.375,
+                          -1.171875e-1,
+                          -1.025390625e-1,
+                          -1.4419555664063e-1,
+                          -2.7757644653320e-1,
+                          -6.7659258842468e-1,
+                          -1.9935317337513,
+                          -6.8839142681099,
+                          -2.7248827311269e1,
+                          -1.2159789187654e2,
+                          -6.0384407670507e2,
+                          -3.3022722944809e3};
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbi1 = CmplxType(0.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      // CmplxType csk0 = CmplxType(0.0,0.0);
+      CmplxType cf0 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 1) cbi1 = cf;
+        // if ((k == 2*(k/2)) && (k != 0)) {
+        //  csk0 = csk0+4.0*cf/static_cast<RealType>(k);
+        //}
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi1 = cbi1 * cs0;
+    } else {  // Using asymptotic expansion (6.2.1) for |z|>joint_val
+              // (default:25)
+      CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1);
+      cbi1         = CmplxType(1.0, 0.0);
+      CmplxType zr = 1.0 / z1;
+      for (int k = 1; k <= 12; k++) {
+        cbi1 = cbi1 + b[k - 1] * Kokkos::pow(zr, 1.0 * k);
+      }
+      cbi1 = ca * cbi1;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      cbi1 = -cbi1;
+    }
+  }
+  return cbi1;
+}
+
+//! Compute modified Bessel function K1(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k1(const CmplxType& z,
+                                               const RealType& joint_val = 9,
+                                               const IntType& bw_start   = 30) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbk1      --- K1(z)
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cbk0, cbi0, cbk1, cbi1;
+  const RealType pi = M_PI;
+  const RealType el = 0.57721566490153286060651209008240;
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbk1 = CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:9)
+      CmplxType cbs  = CmplxType(0.0, 0.0);
+      CmplxType csk0 = CmplxType(0.0, 0.0);
+      CmplxType cf0  = CmplxType(0.0, 0.0);
+      CmplxType cf1  = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 30)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 1) cbi1 = cf;
+        if (k == 0) cbi0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          csk0 = csk0 + 4.0 * cf / static_cast<RealType>(k);
+        }
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+      cbi1 = cbi1 * cs0;
+      cbk0 = -(Kokkos::log(0.5 * z1) + el) * cbi0 + cs0 * csk0;
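+      // K1 then follows from the Wronskian identity
+      // I0(z)*K1(z) + I1(z)*K0(z) = 1/z.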
+      cbk1 = (1.0 / z1 - cbi1 * cbk0) / cbi0;
+    } else {  // Using asymptotic expansion (6.2.2) for |z|>joint_val
+              // (default:9)
+      CmplxType ca0  = Kokkos::sqrt(pi / (2.0 * z1)) * Kokkos::exp(-z1);
+      CmplxType cbkl = CmplxType(1.0, 0.0);
+      CmplxType cr   = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 30; k++) {
+        cr   = 0.125 * cr * (4.0 - pow(2.0 * k - 1.0, 2.0)) / (k * z1);
+        cbkl = cbkl + cr;
+      }
+      cbk1 = ca0 * cbkl;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      if (z.imag() < 0.0)
+        cbk1 = -cbk1 - ci * pi * cyl_bessel_i1<CmplxType, RealType, IntType>(z);
+      if (z.imag() >= 0.0)
+        cbk1 = -cbk1 + ci * pi * cyl_bessel_i1<CmplxType, RealType, IntType>(z);
+    }
+  }
+  return cbk1;
+}
+
+//! Compute Hankel function H10(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h10(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CH12N in S. Zhang & J. Jin, "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch10, cbk0, cbj0, cby0;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch10 = CmplxType(1.0, -inf);
+  } else if (z.imag() <= 0.0) {
+    cbj0 = cyl_bessel_j0<CmplxType, RealType, int>(z);
+    cby0 = cyl_bessel_y0<CmplxType, RealType, int>(z);
+    ch10 = cbj0 + ci * cby0;
+  } else {  // (z.imag() > 0.0)
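+    // Uses H1_0(z) = (2/(i*pi)) * K0(-i*z), valid for Im(z) > 0; h11, h20,
+    // and h21 below rely on the analogous K-to-Hankel relations.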
+    cbk0 = cyl_bessel_k0<CmplxType, RealType, int>(-ci * z, 18.0, 70);
+    ch10 = 2.0 / (pi * ci) * cbk0;
+  }
+
+  return ch10;
+}
+
+//! Compute Hankel function H11(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h11(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CH12N in S. Zhang & J. Jin, "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch11, cbk1, cbj1, cby1;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch11 = CmplxType(0.0, -inf);
+  } else if (z.imag() <= 0.0) {
+    cbj1 = cyl_bessel_j1<CmplxType, RealType, int>(z);
+    cby1 = cyl_bessel_y1<CmplxType, RealType, int>(z);
+    ch11 = cbj1 + ci * cby1;
+  } else {  // (z.imag() > 0.0)
+    cbk1 = cyl_bessel_k1<CmplxType, RealType, int>(-ci * z, 18.0, 70);
+    ch11 = -2.0 / pi * cbk1;
+  }
+
+  return ch11;
+}
+
+//! Compute Hankel function H20(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h20(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CH12N in S. Zhang & J. Jin, "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch20, cbk0, cbj0, cby0;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch20 = CmplxType(1.0, inf);
+  } else if (z.imag() >= 0.0) {
+    cbj0 = cyl_bessel_j0<CmplxType, RealType, int>(z);
+    cby0 = cyl_bessel_y0<CmplxType, RealType, int>(z);
+    ch20 = cbj0 - ci * cby0;
+  } else {  // (z.imag() < 0.0)
+    cbk0 = cyl_bessel_k0<CmplxType, RealType, int>(ci * z, 18.0, 70);
+    ch20 = 2.0 / pi * ci * cbk0;
+  }
+
+  return ch20;
+}
+
+//! Compute Hankel function H21(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h21(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CH12N in S. Zhang & J. Jin, "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch21, cbk1, cbj1, cby1;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch21 = CmplxType(0.0, inf);
+  } else if (z.imag() >= 0.0) {
+    cbj1 = cyl_bessel_j1<CmplxType, RealType, int>(z);
+    cby1 = cyl_bessel_y1<CmplxType, RealType, int>(z);
+    ch21 = cbj1 - ci * cby1;
+  } else {  // (z.imag() < 0.0)
+    cbk1 = cyl_bessel_k1<CmplxType, RealType, int>(ci * z, 18.0, 70);
+    ch21 = -2.0 / pi * cbk1;
+  }
+
+  return ch21;
+}
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
index 2cafac1aea462ec29fe1d1cb853cb374ea7e8109..c814e5a22a32d31e1047f52ac55438934c8194d3 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -524,7 +524,7 @@ class MemoryPool {
     // Fast query clock register 'tic' to pseudo-randomize
     // the guess for which block within a superblock should
     // be claimed.  If not available then a search occurs.
-#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GEN)
+#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
     const uint32_t block_id_hint = alloc_size;
 #else
     const uint32_t block_id_hint =
@@ -585,19 +585,6 @@ class MemoryPool {
               (uint64_t(sb_id) << m_sb_size_lg2)       // superblock memory
               + (uint64_t(result.first) << size_lg2);  // block memory
 
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , alloc_size
-        , sb_id
-        , sb_state 
-        , (1u << size_lg2)
-        , (1u << count_lg2)
-        , result.first 
-        , result.second );
-#endif
-
           break;  // Success
         }
       }
@@ -740,7 +727,8 @@ class MemoryPool {
 
     // Determine which superblock and block
     const ptrdiff_t d =
-        ((char *)p) - ((char *)(m_sb_state_array + m_data_offset));
+        static_cast<char *>(p) -
+        reinterpret_cast<char *>(m_sb_state_array + m_data_offset);
 
     // Verify contained within the memory pool's superblocks:
     const int ok_contains =
@@ -772,29 +760,10 @@ class MemoryPool {
         const int result = CB::release(sb_state_array, bit, block_state);
 
         ok_dealloc_once = 0 <= result;
-
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , sb_id
-        , (1u << block_size_lg2)
-        , (1u << (m_sb_size_lg2 - block_size_lg2))
-        , bit
-        , result );
-#endif
       }
     }
 
     if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , int(ok_contains)
-        , int(ok_block_aligned)
-        , int(ok_dealloc_once) );
-#endif
       Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
     }
   }
diff --git a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
index f23442b793f5eeca8e0c1b22df6468271df96b73..e3cee93e257b154b73df1d0c40514040d7083f22 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -46,7 +46,6 @@
 #define KOKKOS_MEMORYTRAITS_HPP
 
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 
@@ -119,6 +118,15 @@ enum : unsigned {
   MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
 };
 
+// ------------------------------------------------------------------ //
+//  This identifies the default memory trait.
+//
+template <typename Tp>
+struct is_default_memory_trait : std::false_type {};
+
+template <>
+struct is_default_memory_trait<Kokkos::MemoryTraits<0>> : std::true_type {};
+
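+// Illustrative (hypothetical) checks of the trait:
+//   static_assert(is_default_memory_trait<Kokkos::MemoryTraits<0>>::value, "");
+//   static_assert(
+//       !is_default_memory_trait<Kokkos::MemoryTraits<Kokkos::Atomic>>::value,
+//       "");
+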
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
index b9380cbe02b42a04c5b21b6cb8408016049d15f8..1999d46f3c4087dab1192c350da8ba844199a8fe 100644
--- a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -56,11 +56,11 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 // clang-format off
-template <class> struct infinity_helper;
+template <class> struct infinity_helper {};
 template <> struct infinity_helper<float> { static constexpr float value = HUGE_VALF; };
 template <> struct infinity_helper<double> { static constexpr double value = HUGE_VAL; };
 template <> struct infinity_helper<long double> { static constexpr long double value = HUGE_VALL; };
-template <class> struct finite_min_helper;
+template <class> struct finite_min_helper {};
 template <> struct finite_min_helper<bool> { static constexpr bool value = false; };
 template <> struct finite_min_helper<char> { static constexpr char value = CHAR_MIN; };
 template <> struct finite_min_helper<signed char> { static constexpr signed char value = SCHAR_MIN; };
@@ -76,7 +76,7 @@ template <> struct finite_min_helper<unsigned long long int> { static constexpr
 template <> struct finite_min_helper<float> { static constexpr float value = -FLT_MAX; };
 template <> struct finite_min_helper<double> { static constexpr double value = -DBL_MAX; };
 template <> struct finite_min_helper<long double> { static constexpr long double value = -LDBL_MAX; };
-template <class> struct finite_max_helper;
+template <class> struct finite_max_helper {};
 template <> struct finite_max_helper<bool> { static constexpr bool value = true; };
 template <> struct finite_max_helper<char> { static constexpr char value = CHAR_MAX; };
 template <> struct finite_max_helper<signed char> { static constexpr signed char value = SCHAR_MAX; };
@@ -92,7 +92,7 @@ template <> struct finite_max_helper<unsigned long long int> { static constexpr
 template <> struct finite_max_helper<float> { static constexpr float value = FLT_MAX; };
 template <> struct finite_max_helper<double> { static constexpr double value = DBL_MAX; };
 template <> struct finite_max_helper<long double> { static constexpr long double value = LDBL_MAX; };
-template <class> struct epsilon_helper;
+template <class> struct epsilon_helper {};
 namespace{
   // FIXME workaround for LDL_EPSILON with XL
   template<typename T>
@@ -115,15 +115,15 @@ template <> struct epsilon_helper<long double> {
   static constexpr long double value = LDBL_EPSILON;
 #endif
 };
-template <class> struct round_error_helper;
+template <class> struct round_error_helper {};
 template <> struct round_error_helper<float> { static constexpr float value = 0.5F; };
 template <> struct round_error_helper<double> { static constexpr double value = 0.5; };
 template <> struct round_error_helper<long double> { static constexpr long double value = 0.5L; };
-template <class> struct norm_min_helper;
+template <class> struct norm_min_helper {};
 template <> struct norm_min_helper<float> { static constexpr float value = FLT_MIN; };
 template <> struct norm_min_helper<double> { static constexpr double value = DBL_MIN; };
 template <> struct norm_min_helper<long double> { static constexpr long double value = LDBL_MIN; };
-template <class> struct digits_helper;
+template <class> struct digits_helper {};
 template <> struct digits_helper<bool> { static constexpr int value = 1; };
 template <> struct digits_helper<char> { static constexpr int value = CHAR_BIT - std::is_signed<char>::value; };
 template <> struct digits_helper<signed char> { static constexpr int value = CHAR_BIT - 1; };
@@ -139,11 +139,13 @@ template <> struct digits_helper<unsigned long long int> { static constexpr int
 template <> struct digits_helper<float> { static constexpr int value = FLT_MANT_DIG; };
 template <> struct digits_helper<double> { static constexpr int value = DBL_MANT_DIG; };
 template <> struct digits_helper<long double> { static constexpr int value = LDBL_MANT_DIG; };
-template <class> struct digits10_helper;
+template <class> struct digits10_helper {};
 template <> struct digits10_helper<bool> { static constexpr int value = 0; };
-constexpr double log10_2 = 2.41;
+// The fraction 643/2136 approximates log10(2) to 7 significant digits.
+// It works around a GCC bug with -frounding-math that prevented the
+// floating-point expression from being evaluated at compile time.
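+// For example, int with 31 binary digits: 31 * 643 / 2136 == 9 == digits10.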
 #define DIGITS10_HELPER_INTEGRAL(TYPE) \
-template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * log10_2; };
+template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * 643L / 2136; };
 DIGITS10_HELPER_INTEGRAL(char)
 DIGITS10_HELPER_INTEGRAL(signed char)
 DIGITS10_HELPER_INTEGRAL(unsigned char)
@@ -159,15 +161,29 @@ DIGITS10_HELPER_INTEGRAL(unsigned long long int)
 template <> struct digits10_helper<float> { static constexpr int value = FLT_DIG; };
 template <> struct digits10_helper<double> { static constexpr int value = DBL_DIG; };
 template <> struct digits10_helper<long double> { static constexpr int value = LDBL_DIG; };
-template <class> struct max_digits10_helper;
-// FIXME not sure why were not defined in my <cfloat>
-//template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; };
-//template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; };
-//template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; };
-template <> struct max_digits10_helper<float> { static constexpr int value = 9; };
-template <> struct max_digits10_helper<double> { static constexpr int value = 17; };
-template <> struct max_digits10_helper<long double> { static constexpr int value = 21; };
-template <class> struct radix_helper;
+template <class> struct max_digits10_helper {};
+// Approximate ceil(digits<T>::value * log10(2) + 1)
+#define MAX_DIGITS10_HELPER(TYPE) \
+template <> struct max_digits10_helper<TYPE> { static constexpr int value = (digits_helper<TYPE>::value * 643L + 2135) / 2136 + 1; };
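+// For example, float (FLT_MANT_DIG == 24): (24 * 643 + 2135) / 2136 + 1 == 9,
+// which matches FLT_DECIMAL_DIG.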
+#ifdef FLT_DECIMAL_DIG
+template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(float)
+#endif
+#ifdef DBL_DECIMAL_DIG
+template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(double)
+#endif
+#ifdef DECIMAL_DIG
+template <> struct max_digits10_helper<long double> { static constexpr int value = DECIMAL_DIG; };
+#elif defined(LDBL_DECIMAL_DIG)
+template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(long double)
+#endif
+#undef MAX_DIGITS10_HELPER
+template <class> struct radix_helper {};
 template <> struct radix_helper<bool> { static constexpr int value = 2; };
 template <> struct radix_helper<char> { static constexpr int value = 2; };
 template <> struct radix_helper<signed char> { static constexpr int value = 2; };
@@ -183,19 +199,19 @@ template <> struct radix_helper<unsigned long long int> { static constexpr int v
 template <> struct radix_helper<float> { static constexpr int value = FLT_RADIX; };
 template <> struct radix_helper<double> { static constexpr int value = FLT_RADIX; };
 template <> struct radix_helper<long double> { static constexpr int value = FLT_RADIX; };
-template <class> struct min_exponent_helper;
+template <class> struct min_exponent_helper {};
 template <> struct min_exponent_helper<float> { static constexpr int value = FLT_MIN_EXP; };
 template <> struct min_exponent_helper<double> { static constexpr int value = DBL_MIN_EXP; };
 template <> struct min_exponent_helper<long double> { static constexpr int value = LDBL_MIN_EXP; };
-template <class> struct min_exponent10_helper;
+template <class> struct min_exponent10_helper {};
 template <> struct min_exponent10_helper<float> { static constexpr int value = FLT_MIN_10_EXP; };
 template <> struct min_exponent10_helper<double> { static constexpr int value = DBL_MIN_10_EXP; };
 template <> struct min_exponent10_helper<long double> { static constexpr int value = LDBL_MIN_10_EXP; };
-template <class> struct max_exponent_helper;
+template <class> struct max_exponent_helper {};
 template <> struct max_exponent_helper<float> { static constexpr int value = FLT_MAX_EXP; };
 template <> struct max_exponent_helper<double> { static constexpr int value = DBL_MAX_EXP; };
 template <> struct max_exponent_helper<long double> { static constexpr int value = LDBL_MAX_EXP; };
-template <class> struct max_exponent10_helper;
+template <class> struct max_exponent10_helper {};
 template <> struct max_exponent10_helper<float> { static constexpr int value = FLT_MAX_10_EXP; };
 template <> struct max_exponent10_helper<double> { static constexpr int value = DBL_MAX_10_EXP; };
 template <> struct max_exponent10_helper<long double> { static constexpr int value = LDBL_MAX_10_EXP; };
diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
index eedba38a8456117ac03d8c21e657729673017984..8f12eceb27c46946a83c64eceec0711cca6ef2b7 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -62,7 +62,6 @@
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
@@ -105,9 +104,11 @@ class OpenMP {
   /// \brief Wait until all dispatched functors complete on the given instance
   ///
   ///  This is a no-op on OpenMP
-  static void impl_static_fence(OpenMP const& = OpenMP()) noexcept;
+  static void impl_static_fence(OpenMP const&           = OpenMP(),
+                                const std::string& name = "") noexcept;
 
   void fence() const;
+  void fence(const std::string& name) const;
 
   /// \brief Does the given instance return immediately after launching
   /// a parallel algorithm
@@ -167,7 +168,7 @@ class OpenMP {
   static int impl_get_current_max_threads() noexcept;
 
   static constexpr const char* name() noexcept { return "OpenMP"; }
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 };
 
 namespace Tools {
@@ -188,6 +189,7 @@ class OpenMPSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
index 2a57a43e63b77b7f60e4cc40bb20272e0332944a..f394f3240832a67f22e4056fa27e33500b38178c 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@@ -56,9 +56,8 @@
 #include <Kokkos_OpenMPTargetSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
@@ -92,7 +91,10 @@ class OpenMPTarget {
   inline static bool in_parallel() { return omp_in_parallel(); }
 
   static void fence();
+  static void fence(const std::string&);
 
+  static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
 
@@ -115,7 +117,7 @@ class OpenMPTarget {
   }
 
   OpenMPTarget();
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Impl::OpenMPTargetInternal* m_space_instance;
@@ -141,6 +143,7 @@ class OpenMPTargetSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
index 58d723ac110a2bfd2266d4055f9c222c4a2c2c78..c1d338331f56cd59a9eb917a2d8f72ebb06b453b 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@@ -89,6 +89,41 @@ namespace Impl {
 }  // namespace Impl
 }  // namespace Kokkos
 
+namespace Kokkos {
+namespace Impl {
+
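+// Reading aid (general Kokkos convention, not specific to this backend):
+// MemorySpaceAccess<SpaceA, SpaceB> records whether SpaceB handles may be
+// assigned to SpaceA handles (assignable), whether SpaceA's execution space
+// can dereference SpaceB allocations (accessible), and whether deep_copy
+// between the two spaces is supported (deepcopy).
+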
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::OpenMPTargetSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                         Kokkos::Experimental::OpenMPTargetSpace> {
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
+};
+}  // namespace Impl
+}  // namespace Kokkos
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -213,7 +248,10 @@ struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,
                                        omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, OpenMPTargetSpace>: fence "
+        "before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_default_device(),
@@ -231,7 +269,9 @@ struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, HostSpace,
                                        omp_get_initial_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, HostSpace>: fence before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_default_device(),
@@ -249,7 +289,9 @@ struct DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace,
                                        omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, OpenMPTargetSpace>: fence before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_initial_device(),
diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp
index 85d1dad454ba64aa1311cf19437206768018571b..25ebe26155fed5812e161e14360811cfc660e105 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp
@@ -48,23 +48,19 @@
 #ifndef KOKKOS_PARALLEL_HPP
 #define KOKKOS_PARALLEL_HPP
 
-#include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_View.hpp>
+#include <Kokkos_DetectionIdiom.hpp>
 #include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_View.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
-#include <type_traits>
-#include <typeinfo>
-
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#ifdef KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-#include <iostream>
-#endif
+#include <cstddef>
+#include <type_traits>
+#include <typeinfo>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -72,34 +68,11 @@
 namespace Kokkos {
 namespace Impl {
 
-template <class T, class = void>
-struct is_detected_execution_space : std::false_type {
-  using type = not_a_type;
-};
-
 template <class T>
-struct is_detected_execution_space<T, void_t<typename T::execution_space>>
-    : std::true_type {
-  using type = typename T::execution_space;
-};
+using execution_space_t = typename T::execution_space;
 
 template <class T>
-using detected_execution_space_t =
-    typename is_detected_execution_space<T>::type;
-
-template <class T, class = void>
-struct is_detected_device_type : std::false_type {
-  using type = not_a_type;
-};
-
-template <class T>
-struct is_detected_device_type<T, void_t<typename T::device_type>>
-    : std::true_type {
-  using type = typename T::device_type;
-};
-
-template <class T>
-using detected_device_type_t = typename is_detected_device_type<T>::type;
+using device_type_t = typename T::device_type;
 
 //----------------------------------------------------------------------------
 /** \brief  Given a Functor and Execution Policy query an execution space.
@@ -112,16 +85,14 @@ using detected_device_type_t = typename is_detected_device_type<T>::type;
 
 template <class Functor, class Policy>
 struct FunctorPolicyExecutionSpace {
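+  // Resolution order (a reading aid for the detection expression below):
+  // prefer Policy::execution_space, then Functor::execution_space, then the
+  // execution_space of Functor::device_type, else DefaultExecutionSpace.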
-  using execution_space = std::conditional_t<
-      is_detected_execution_space<Policy>::value,
-      detected_execution_space_t<Policy>,
-      std::conditional_t<
-          is_detected_execution_space<Functor>::value,
-          detected_execution_space_t<Functor>,
+  using execution_space = detected_or_t<
+      detected_or_t<
           std::conditional_t<
-              is_detected_device_type<Functor>::value,
-              detected_execution_space_t<detected_device_type_t<Functor>>,
-              Kokkos::DefaultExecutionSpace>>>;
+              is_detected<device_type_t, Functor>::value,
+              detected_t<execution_space_t, detected_t<device_type_t, Functor>>,
+              Kokkos::DefaultExecutionSpace>,
+          execution_space_t, Functor>,
+      execution_space_t, Policy>;
 };
 
 }  // namespace Impl
@@ -158,8 +129,7 @@ inline void parallel_for(
     const ExecPolicy& policy, const FunctorType& functor,
     const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecPolicy>::value>::type* = nullptr) {
   uint64_t kpID = 0;
 
   ExecPolicy inner_policy = policy;
@@ -200,18 +170,7 @@ inline void parallel_for(const size_t work_count, const FunctorType& functor,
 template <class ExecPolicy, class FunctorType>
 inline void parallel_for(const std::string& str, const ExecPolicy& policy,
                          const FunctorType& functor) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_for(policy, functor, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_for kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 }  // namespace Kokkos
@@ -255,9 +214,12 @@ namespace Kokkos {
 ///   // operator() or join().
 ///   using value_type = PodType;
 ///
-///   void operator () (const ExecPolicy::member_type & i, value_type& update,
-///   const bool final_pass) const; void init (value_type& update) const; void
-///   join (volatile value_type& update, volatile const value_type& input) const
+///   void operator () (const ExecPolicy::member_type & i,
+///                     value_type& update,
+///                     const bool final_pass) const;
+///   void init (value_type& update) const;
+///   void join (volatile value_type& update,
+///              volatile const value_type& input) const
 /// };
 /// \endcode
 ///
@@ -389,8 +351,7 @@ inline void parallel_scan(
     const ExecutionPolicy& policy, const FunctorType& functor,
     const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -430,18 +391,7 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor,
 template <class ExecutionPolicy, class FunctorType>
 inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
                           const FunctorType& functor) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_scan(policy, functor, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 template <class ExecutionPolicy, class FunctorType, class ReturnType>
@@ -449,8 +399,7 @@ inline void parallel_scan(
     const ExecutionPolicy& policy, const FunctorType& functor,
     ReturnType& return_value, const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -464,7 +413,8 @@ inline void parallel_scan(
 
   Kokkos::Tools::Impl::end_parallel_scan(inner_policy, functor, str, kpID);
 
-  policy.space().fence();
+  policy.space().fence(
+      "Kokkos::parallel_scan: fence due to result being a value, not a view");
 }
 
 template <class FunctorType, class ReturnType>
@@ -491,25 +441,15 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor,
 
   Kokkos::Tools::Impl::end_parallel_scan(execution_policy, functor, str, kpID);
 
-  execution_space().fence();
+  execution_space().fence(
+      "Kokkos::parallel_scan: fence after scan with return value");
 }
 
 template <class ExecutionPolicy, class FunctorType, class ReturnType>
 inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
                           const FunctorType& functor,
                           ReturnType& return_value) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_scan(policy, functor, return_value, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index 96242f99b0ca678e1ede6f148ae5d90a16127afe..bc613cea62b10a56f888baabbc16ed9258e041dd 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -811,7 +811,7 @@ struct ParallelReducePolicyType;
 template <class PolicyType, class FunctorType>
 struct ParallelReducePolicyType<
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type,
+        Kokkos::is_execution_policy<PolicyType>::value>::type,
     PolicyType, FunctorType> {
   using policy_type = PolicyType;
   static PolicyType policy(const PolicyType& policy_) { return policy_; }
@@ -948,9 +948,10 @@ parallel_reduce_needs_fence(ExecutionSpace const&, ViewLike const&) {
 template <class ExecutionSpace, class... Args>
 struct ParallelReduceFence {
   template <class... ArgsDeduced>
-  static void fence(const ExecutionSpace& ex, ArgsDeduced&&... args) {
+  static void fence(const ExecutionSpace& ex, const std::string& name,
+                    ArgsDeduced&&... args) {
     if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) {
-      ex.fence();
+      ex.fence(name);
     }
   }
 };
@@ -974,7 +975,6 @@ struct ParallelReduceFence {
  *    void join( volatile       <podType> & update ,
  *               volatile const <podType> & input ) const ;
  *
- *    using has_final = true_type;
  *    void final( <podType> & update ) const ;
  *  };
  * \endcode
@@ -991,7 +991,6 @@ struct ParallelReduceFence {
  *    void join( volatile       <podType> update[] ,
  *               volatile const <podType> input[] ) const ;
  *
- *    using has_final = true_type;
  *    void final( <podType> update[] ) const ;
  *  };
  * \endcode
@@ -1001,24 +1000,30 @@ struct ParallelReduceFence {
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const std::string& label, const PolicyType& policy,
                 const FunctorType& functor, ReturnType& return_value) {
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       label, policy, functor, return_value);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const PolicyType& policy, const FunctorType& functor,
                 ReturnType& return_value) {
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       "", policy, functor, return_value);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1030,7 +1035,10 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       "", policy_type(0, policy), functor, return_value);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1043,33 +1051,42 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       label, policy_type(0, policy), functor, return_value);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 // ReturnValue as View or Reducer: take by copy to allow for inline construction
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const std::string& label, const PolicyType& policy,
                 const FunctorType& functor, const ReturnType& return_value) {
   ReturnType return_value_impl = return_value;
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       label, policy, functor, return_value_impl);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const PolicyType& policy, const FunctorType& functor,
                 const ReturnType& return_value) {
   ReturnType return_value_impl = return_value;
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       "", policy, functor, return_value_impl);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1082,7 +1099,10 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       "", policy_type(0, policy), functor, return_value_impl);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1096,7 +1116,10 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       label, policy_type(0, policy), functor, return_value_impl);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 // No Return Argument
@@ -1106,8 +1129,7 @@ inline void parallel_reduce(
     const std::string& label, const PolicyType& policy,
     const FunctorType& functor,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
   using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
                                         typename ValueTraits::value_type,
@@ -1131,8 +1153,7 @@ template <class PolicyType, class FunctorType>
 inline void parallel_reduce(
     const PolicyType& policy, const FunctorType& functor,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
   using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
                                         typename ValueTraits::value_type,
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/src/Kokkos_Rank.hpp
similarity index 71%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
rename to packages/kokkos/core/src/Kokkos_Rank.hpp
index 5eed2ca0d77b828b2431bfce0fe69c4da457bb95..3603e2860891758e489471683d34438f219f84d5 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
+++ b/packages/kokkos/core/src/Kokkos_Rank.hpp
@@ -42,5 +42,30 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_b.hpp>
+#ifndef KOKKOS_KOKKOS_RANK_HPP
+#define KOKKOS_KOKKOS_RANK_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Layout.hpp>  // Iterate
+
+namespace Kokkos {
+
+// Iteration Pattern
+template <unsigned N, Iterate OuterDir = Iterate::Default,
+          Iterate InnerDir = Iterate::Default>
+struct Rank {
+  static_assert(N != 0u, "Kokkos Error: rank 0 undefined");
+  static_assert(N != 1u,
+                "Kokkos Error: rank 1 is not a multi-dimensional range");
+  static_assert(N < 7u, "Kokkos Error: Unsupported rank...");
+
+  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
+
+  static constexpr int rank                = N;
+  static constexpr Iterate outer_direction = OuterDir;
+  static constexpr Iterate inner_direction = InnerDir;
+};
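+
+// Usage sketch: Rank selects the iteration pattern of an MDRangePolicy, e.g.
+//   Kokkos::MDRangePolicy<Kokkos::Rank<2, Kokkos::Iterate::Left>> policy(
+//       {0, 0}, {n, m});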
+
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_RANK_HPP
diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp
index 8ee76b43862fd6c54c42d98e081174f11d5e09e4..02095ff7b3f01f7558d8e154d1ebc49ff46d1241 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp
@@ -83,7 +83,9 @@ class SYCL {
   SYCL();
   explicit SYCL(const sycl::queue&);
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept {
+    return m_space_instance->impl_get_instance_id();
+  }
 
   sycl::context sycl_context() const noexcept {
     return m_space_instance->m_queue->get_context();
@@ -110,7 +112,9 @@ class SYCL {
 
   /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
   void fence() const;
+  void fence(const std::string&) const;
 
   /// \brief Print configuration information to the given output stream.
   void print_configuration(std::ostream&, const bool detail = false);
@@ -165,6 +169,7 @@ class SYCLSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
@@ -181,6 +186,41 @@ struct DeviceTypeTraits<Kokkos::Experimental::SYCL> {
 }  // namespace Experimental
 }  // namespace Tools
 
+namespace Experimental {
+template <class... Args>
+std::vector<SYCL> partition_space(const SYCL& sycl_space, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+
+  sycl::context context = sycl_space.sycl_context();
+  sycl::default_selector device_selector;
+  std::vector<SYCL> instances;
+  instances.reserve(sizeof...(Args));
+  for (unsigned int i = 0; i < sizeof...(Args); ++i)
+    instances.emplace_back(sycl::queue(context, device_selector));
+  return instances;
+}
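+
+// Hypothetical usage sketch: create two instances and submit to the first,
+//   auto inst = Kokkos::Experimental::partition_space(SYCL(), 1, 1);
+//   Kokkos::parallel_for(
+//       Kokkos::RangePolicy<Kokkos::Experimental::SYCL>(inst[0], 0, n), f);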
+
+template <class T>
+std::vector<SYCL> partition_space(const SYCL& sycl_space,
+                                  std::vector<T>& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  sycl::context context = sycl_space.sycl_context();
+  sycl::default_selector device_selector;
+  std::vector<SYCL> instances;
+  instances.reserve(weights.size());
+  for (unsigned int i = 0; i < weights.size(); ++i)
+    instances.emplace_back(sycl::queue(context, device_selector));
+  return instances;
+}
+}  // namespace Experimental
+
 }  // namespace Kokkos
 
 #endif
diff --git a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
index 392ab0e59a7d01f42342318bb44aa172bcb4f705..15ef11024d53501356d8004c58fc94fbeca80227 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
@@ -49,12 +49,19 @@
 
 #ifdef KOKKOS_ENABLE_SYCL
 #include <Kokkos_Concepts.hpp>
+#include <Kokkos_HostSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
 #include <impl/Kokkos_Tools.hpp>
 
 namespace Kokkos {
+
+namespace Impl {
+template <typename T>
+struct is_sycl_type_space : public std::false_type {};
+}  // namespace Impl
+
 namespace Experimental {
 
 class SYCLDeviceUSMSpace {
@@ -118,9 +125,54 @@ class SYCLSharedUSMSpace {
  private:
   sycl::queue m_queue;
 };
+
+class SYCLHostUSMSpace {
+ public:
+  using execution_space = HostSpace::execution_space;
+  using memory_space    = SYCLHostUSMSpace;
+  using device_type     = Kokkos::Device<execution_space, memory_space>;
+  using size_type       = Impl::SYCLInternal::size_type;
+
+  SYCLHostUSMSpace();
+  explicit SYCLHostUSMSpace(sycl::queue queue);
+
+  void* allocate(const std::size_t arg_alloc_size) const;
+  void* allocate(const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
+
+  void deallocate(void* const arg_alloc_ptr,
+                  const std::size_t arg_alloc_size) const;
+  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                  const size_t arg_alloc_size,
+                  const size_t arg_logical_size = 0) const;
+
+ private:
+  template <class, class, class, class>
+  friend class LogicalMemorySpace;
+
+ public:
+  static constexpr const char* name() { return "SYCLHostUSM"; }
+
+ private:
+  sycl::queue m_queue;
+};
+
 }  // namespace Experimental
 
 namespace Impl {
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLDeviceUSMSpace>
+    : public std::true_type {};
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLSharedUSMSpace>
+    : public std::true_type {};
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLHostUSMSpace>
+    : public std::true_type {};
+
 static_assert(Kokkos::Impl::MemorySpaceAccess<
                   Kokkos::Experimental::SYCLDeviceUSMSpace,
                   Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
@@ -131,6 +183,11 @@ static_assert(Kokkos::Impl::MemorySpaceAccess<
                   Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
               "");
 
+static_assert(Kokkos::Impl::MemorySpaceAccess<
+                  Kokkos::Experimental::SYCLHostUSMSpace,
+                  Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+              "");
+
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace,
                          Kokkos::Experimental::SYCLDeviceUSMSpace> {
@@ -148,6 +205,16 @@ struct MemorySpaceAccess<Kokkos::HostSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // HostSpace::execution_space ==
+  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
 template <>
 struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
                          Kokkos::HostSpace> {
@@ -165,6 +232,18 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // Experimental::SYCLDeviceUSMSpace::execution_space !=
+  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { assignable = false };
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLDeviceUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
 //----------------------------------------
 // SYCLSharedUSMSpace::execution_space == SYCL
 // SYCLSharedUSMSpace accessible to both SYCL and Host
@@ -191,17 +270,46 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
 };
 
 template <>
-struct MemorySpaceAccess<
-    Kokkos::Experimental::SYCLDeviceUSMSpace,
-    Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // Experimental::SYCLSharedUSMSpace::execution_space !=
+  // Experimental::SYCLHostUSMSpace::execution_space
   enum : bool { assignable = false };
-  enum : bool { accessible = true };
-  enum : bool { deepcopy = false };
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLSharedUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };  // Cannot access from SYCL
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCLDeviceUSMSpace> {
+  enum : bool { assignable = false };  // Cannot access from Host
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCLSharedUSMSpace> {
+  enum : bool { assignable = false };  // different execution_space
+  enum : bool { accessible = true };   // same accessibility
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<
-    Kokkos::Experimental::SYCLSharedUSMSpace,
+    Kokkos::Experimental::SYCLDeviceUSMSpace,
     Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
   enum : bool { assignable = false };
   enum : bool { accessible = true };
@@ -276,6 +384,37 @@ class SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>
       const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
 
+template <>
+class SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>
+    : public SharedAllocationRecordCommon<
+          Kokkos::Experimental::SYCLHostUSMSpace> {
+ private:
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::SYCLHostUSMSpace>;
+  using base_t =
+      SharedAllocationRecordCommon<Kokkos::Experimental::SYCLHostUSMSpace>;
+  using RecordBase = SharedAllocationRecord<void, void>;
+
+  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
+  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;
+
+  static RecordBase s_root_record;
+
+  const Kokkos::Experimental::SYCLHostUSMSpace m_space;
+
+ protected:
+  ~SharedAllocationRecord();
+
+  SharedAllocationRecord() = default;
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+};
+
 }  // namespace Impl
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
index 2eebf5365e71d2c5cf42c356951ccec9d041fe14..bb740cfb86a966aefd4ac1ab6c9233ab81e0a97d 100644
--- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -148,10 +148,10 @@ class ScratchMemorySpace {
                                             const IntType& size_L0,
                                             void* ptr_L1           = nullptr,
                                             const IntType& size_L1 = 0)
-      : m_iter_L0((char*)ptr_L0),
-        m_iter_L1((char*)ptr_L1),
-        m_end_L0((char*)ptr_L0 + size_L0),
-        m_end_L1((char*)ptr_L1 + size_L1),
+      : m_iter_L0(static_cast<char*>(ptr_L0)),
+        m_iter_L1(static_cast<char*>(ptr_L1)),
+        m_end_L0(static_cast<char*>(ptr_L0) + size_L0),
+        m_end_L1(static_cast<char*>(ptr_L1) + size_L1),
         m_multiplier(1),
         m_offset(0),
         m_default_level(0) {}
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Kokkos_Serial.hpp
index 4d5bb2410bfaabf6f752acf55795c9d7ef82016d..9c8ae70721e58bff5a91da3b99e3630ffe0c273a 100644
--- a/packages/kokkos/core/src/Kokkos_Serial.hpp
+++ b/packages/kokkos/core/src/Kokkos_Serial.hpp
@@ -53,6 +53,8 @@
 
 #include <cstddef>
 #include <iosfwd>
+#include <mutex>
+#include <thread>
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
@@ -60,12 +62,12 @@
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Tools.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
 
 #include <KokkosExp_MDRangePolicy.hpp>
 
@@ -73,6 +75,32 @@
 
 namespace Kokkos {
 
+namespace Impl {
+class SerialInternal {
+ public:
+  SerialInternal() = default;
+
+  bool is_initialized();
+
+  void initialize();
+
+  void finalize();
+
+  static SerialInternal& singleton();
+
+  std::mutex m_thread_team_data_mutex;
+
+  // Resize thread team data scratch memory
+  void resize_thread_team_data(size_t pool_reduce_bytes,
+                               size_t team_reduce_bytes,
+                               size_t team_shared_bytes,
+                               size_t thread_local_bytes);
+
+  HostThreadTeamData m_thread_team_data;
+  bool m_is_initialized = false;
+};
+}  // namespace Impl
+
 /// \class Serial
 /// \brief Kokkos device for non-parallel execution
 ///
@@ -107,6 +135,8 @@ class Serial {
 
   //@}
 
+  Serial();
+
   /// \brief True if and only if this method is being called in a
   ///   thread-parallel function.
   ///
@@ -121,9 +151,26 @@ class Serial {
   /// return asynchronously, before the functor completes.  This
   /// method does not return until all dispatched functors on this
   /// device have completed.
-  static void impl_static_fence() {}
+  static void impl_static_fence() {
+    impl_static_fence(
+        "Kokkos::Serial::impl_static_fence: Unnamed Static Fence");
+  }
+  static void impl_static_fence(const std::string& name) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        []() {});  // TODO: correct device ID
+    Kokkos::memory_fence();
+  }
 
-  void fence() const {}
+  void fence() const { fence("Kokkos::Serial::fence: Unnamed Instance Fence"); }
+  void fence(const std::string& name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
+        name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1},
+        []() {});  // TODO: correct device ID
+    Kokkos::memory_fence();
+  }
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency() { return 1; }
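With the named overloads above, callers can label fences for profiling tools. A usage sketch, assuming a Kokkos build with the Serial backend and an initialized runtime; the label strings are arbitrary:

```cpp
#include <Kokkos_Core.hpp>

void flush_serial_work() {
  // Instance fence: reported to tools under the given name.
  Kokkos::Serial().fence("my_app: flush serial work");
  // Static fence: synchronizes all instances of the backend.
  Kokkos::Serial::impl_static_fence("my_app: global serial sync");
}
```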
@@ -153,9 +200,24 @@ class Serial {
     return impl_thread_pool_size(0);
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 
   static const char* name();
+
+  Impl::SerialInternal* impl_internal_space_instance() const {
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    return m_space_instance;
+#else
+    return m_space_instance.get();
+#endif
+  }
+
+ private:
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+  Impl::SerialInternal* m_space_instance;
+#else
+  Kokkos::Impl::HostSharedPtr<Impl::SerialInternal> m_space_instance;
+#endif
   //--------------------------------------------------------------------------
 };
 
@@ -177,6 +239,7 @@ class SerialSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
@@ -206,20 +269,6 @@ struct MemorySpaceAccess<Kokkos::Serial::memory_space,
 namespace Kokkos {
 namespace Impl {
 
-// Resize thread team data scratch memory
-void serial_resize_thread_team_data(size_t pool_reduce_bytes,
-                                    size_t team_reduce_bytes,
-                                    size_t team_shared_bytes,
-                                    size_t thread_local_bytes);
-
-HostThreadTeamData* serial_get_thread_team_data();
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-namespace Kokkos {
-namespace Impl {
-
 /*
  * < Kokkos::Serial , WorkArgTag >
  * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial ,
@@ -510,13 +559,19 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -606,13 +661,18 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    HostThreadTeamData& data = *serial_get_thread_team_data();
-
-    reference_type update =
-        ValueInit::init(m_functor, pointer_type(data.pool_reduce_local()));
+    reference_type update = ValueInit::init(
+        m_functor,
+        pointer_type(
+            internal_instance->m_thread_team_data.pool_reduce_local()));
 
     this->template exec<WorkTag>(update);
   }
@@ -667,13 +727,18 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    HostThreadTeamData& data = *serial_get_thread_team_data();
-
-    reference_type update =
-        ValueInit::init(m_functor, pointer_type(data.pool_reduce_local()));
+    reference_type update = ValueInit::init(
+        m_functor,
+        pointer_type(
+            internal_instance->m_thread_team_data.pool_reduce_local()));
 
     this->template exec<WorkTag>(update);
 
@@ -797,13 +862,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -869,6 +940,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using Member = typename Policy::member_type;
 
   const FunctorType m_functor;
+  const Policy m_policy;
   const int m_league;
   const int m_shared;
 
@@ -896,16 +968,21 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const size_t team_shared_size  = m_shared;
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    this->template exec<typename Policy::work_tag>(data);
+    this->template exec<typename Policy::work_tag>(
+        internal_instance->m_thread_team_data);
   }
 
   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                  FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {}
@@ -941,6 +1018,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using reference_type = typename Analysis::reference_type;
 
   const FunctorType m_functor;
+  const Policy m_policy;
   const int m_league;
   const ReducerType m_reducer;
   pointer_type m_result_ptr;
@@ -973,18 +1051,24 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const size_t team_shared_size  = m_shared;
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
 
-    this->template exec<WorkTag>(data, update);
+    this->template exec<WorkTag>(internal_instance->m_thread_team_data, update);
 
     Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
         ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -998,6 +1082,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = nullptr)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
@@ -1016,6 +1101,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
diff --git a/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
index 91e079a0e78e314cdb4b22a42876564f25143a4c..9751fab460d4495b1683a94fc0e9d7f879c9a412 100644
--- a/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
@@ -43,5 +43,9 @@
 */
 
 // For backward compatibility:
+#include <Kokkos_Macros.hpp>
+
+KOKKOS_IMPL_WARNING(
+    "This file is deprecated. Use <Kokkos_TaskScheduler.hpp> instead.")
 
 #include <Kokkos_TaskScheduler.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
index 743273670c9b5fa77f6d590596eb27fc7204396a..17e78f5e81fe83c940eea1bcd6c1d6c347649a4b 100644
--- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
@@ -372,7 +371,10 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
         task_base* const t = arg[i].m_task;
         if (nullptr != t) {
           // Increment reference count to track subsequent assignment.
-          Kokkos::atomic_increment(&(t->m_ref_count));
+          // This likely has to be SeqCst
+          Kokkos::Impl::desul_atomic_inc(&(t->m_ref_count),
+                                         Kokkos::Impl::MemoryOrderSeqCst(),
+                                         Kokkos::Impl::MemoryScopeDevice());
           if (q != static_cast<queue_type const*>(t->m_queue)) {
             Kokkos::abort(
                 "Kokkos when_all Futures must be in the same scheduler");
@@ -467,7 +469,10 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
           //  scheduler" );
           //}
           // Increment reference count to track subsequent assignment.
-          Kokkos::atomic_increment(&(arg_f.m_task->m_ref_count));
+          // This increment likely has to be SeqCst
+          Kokkos::Impl::desul_atomic_inc(&(arg_f.m_task->m_ref_count),
+                                         Kokkos::Impl::MemoryOrderSeqCst(),
+                                         Kokkos::Impl::MemoryScopeDevice());
           dep[i] = arg_f.m_task;
         }
       }
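The two hunks above replace a plain atomic increment of the reference count with a desul increment carrying explicit memory-order and scope tags. By analogy in standard C++ (`std::atomic` shown only for illustration; `m_ref_count` itself is not a `std::atomic`):

```cpp
#include <atomic>

// A sequentially consistent reference-count increment, mirroring the
// MemoryOrderSeqCst / MemoryScopeDevice intent of the desul call above.
void retain(std::atomic<int>& ref_count) {
  ref_count.fetch_add(1, std::memory_order_seq_cst);
}
```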
diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Kokkos_Threads.hpp
index e827c2a2a1abd46999360c1eef57eb85428436aa..da9bea9c2347faca7b8e5944cb08a0783983f285 100644
--- a/packages/kokkos/core/src/Kokkos_Threads.hpp
+++ b/packages/kokkos/core/src/Kokkos_Threads.hpp
@@ -57,7 +57,6 @@
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
 /*--------------------------------------------------------------------------*/
@@ -65,6 +64,7 @@
 namespace Kokkos {
 namespace Impl {
 class ThreadsExec;
+enum class fence_is_static { yes, no };
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -108,8 +108,10 @@ class Threads {
   /// method does not return until all dispatched functors on this
   /// device have completed.
   static void impl_static_fence();
+  static void impl_static_fence(const std::string& name);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
@@ -167,7 +169,7 @@ class Threads {
     return impl_thread_pool_rank();
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 
   static const char* name();
   //@}
@@ -192,6 +194,7 @@ class ThreadsSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp
index f7cc34cc114d29cbe5612bf4350fe01a498282c3..52edd82052f4cfb3919b4733e4acb167780eaf8e 100644
--- a/packages/kokkos/core/src/Kokkos_Tuners.hpp
+++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp
@@ -306,7 +306,11 @@ class MultidimensionalSparseTuningProblem {
   static constexpr size_t max_space_dimension_size = MaxDimensionSize;
   static constexpr double tuning_min               = 0.0;
   static constexpr double tuning_max               = 0.999;
-  static constexpr double tuning_step = tuning_max / max_space_dimension_size;
+
+  // Not declared as static constexpr to work around the following compiler bug
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96862
+  // where a floating-point expression cannot be constexpr under -frounding-math
+  double tuning_step = tuning_max / max_space_dimension_size;
 
   using StoredProblemSpace =
       typename Impl::MapTypeConverter<ProblemSpaceInput>::type;
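The workaround above keeps the operand literals `static constexpr` and moves only the computed quotient into a non-static member. A reduced sketch of the same dodge (illustrative struct, not the Kokkos class):

```cpp
// Under -frounding-math, GCC (bug 96862) refuses to constant-fold the
// floating-point division, so the quotient lives in an ordinary member
// initialized at construction time; the operand constants stay constexpr.
struct TuningStepExample {
  static constexpr double tuning_max      = 0.999;
  static constexpr int max_dimension_size = 100;
  double tuning_step = tuning_max / max_dimension_size;  // not constexpr
};
```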
@@ -315,17 +319,45 @@ class MultidimensionalSparseTuningProblem {
 
   using ValueArray = std::array<Kokkos::Tools::Experimental::VariableValue,
                                 space_dimensionality>;
+  template <class Key, class Value>
+  using extended_map = std::map<Key, Value>;
+  template <typename Key>
+  using extended_problem =
+      MultidimensionalSparseTuningProblem<extended_map, MaxDimensionSize, Key,
+                                          ProblemSpaceInput>;
+  template <typename Key, typename Value>
+  using ExtendedProblemSpace =
+      typename Impl::MapTypeConverter<extended_map<Key, Value>>::type;
+
+  template <typename Key>
+  auto extend(const std::string& axis_name,
+              const std::vector<Key>& new_tuning_axis) const
+      -> extended_problem<Key> {
+    ExtendedProblemSpace<Key, ProblemSpaceInput> extended_space;
+    for (auto& key : new_tuning_axis) {
+      extended_space.add_root_value(key);
+      extended_space.add_sub_container(m_space);
+    }
+    std::vector<std::string> extended_names;
+    extended_names.reserve(m_variable_names.size() + 1);
+    extended_names.push_back(axis_name);
+    extended_names.insert(extended_names.end(), m_variable_names.begin(),
+                          m_variable_names.end());
+    return extended_problem<Key>(extended_space, extended_names);
+  }
 
  private:
   StoredProblemSpace m_space;
   std::array<size_t, space_dimensionality> variable_ids;
+  std::vector<std::string> m_variable_names;
   size_t context;
 
  public:
   MultidimensionalSparseTuningProblem() = default;
-  MultidimensionalSparseTuningProblem(ProblemSpaceInput space,
+
+  MultidimensionalSparseTuningProblem(StoredProblemSpace space,
                                       const std::vector<std::string>& names)
-      : m_space(HierarchyConstructor::build(space)) {
+      : m_space(std::move(space)), m_variable_names(names) {
     assert(names.size() == space_dimensionality);
     for (unsigned long x = 0; x < names.size(); ++x) {
       VariableInfo info;
@@ -340,6 +372,20 @@ class MultidimensionalSparseTuningProblem {
     }
   }
 
+  MultidimensionalSparseTuningProblem(ProblemSpaceInput space,
+                                      const std::vector<std::string>& names)
+      : MultidimensionalSparseTuningProblem(HierarchyConstructor::build(space),
+                                            names) {}
+
+  template <typename... Coordinates>
+  auto get_point(Coordinates... coordinates) {
+    using ArrayType = std::array<Kokkos::Tools::Experimental::VariableValue,
+                                 sizeof...(coordinates)>;
+    return Impl::get_point(
+        m_space, ArrayType({Kokkos::Tools::Experimental::make_variable_value(
+                     0, static_cast<double>(coordinates))...}));
+  }
+
   auto begin() {
     context = Kokkos::Tools::Experimental::get_new_context_id();
     ValueArray values;
@@ -349,12 +395,28 @@ class MultidimensionalSparseTuningProblem {
     }
     begin_context(context);
     request_output_values(context, space_dimensionality, values.data());
-    return get_point(m_space, values);
+    return Impl::get_point(m_space, values);
   }
 
   auto end() { end_context(context); }
 };
 
+template <typename Tuner>
+struct ExtendableTunerMixin {
+  template <typename Key>
+  auto combine(const std::string& axis_name,
+               const std::vector<Key>& new_axis) const {
+    const auto& sub_tuner = static_cast<const Tuner*>(this)->get_tuner();
+    return sub_tuner.extend(axis_name, new_axis);
+  }
+
+  template <typename... Coordinates>
+  auto get_point(Coordinates... coordinates) {
+    const auto& sub_tuner = static_cast<const Tuner*>(this)->get_tuner();
+    return sub_tuner.get_point(coordinates...);
+  }
+};
+
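`ExtendableTunerMixin`, defined further down, relies on the same CRTP shape: the base casts `this` to the derived tuner and forwards through its `get_tuner()`. A reduced sketch of the mechanism with toy names:

```cpp
// CRTP: the base template is parameterized on the derived type, so it can
// call derived members without virtual dispatch.
template <typename Derived>
struct ForwardingMixin {
  int doubled() const { return static_cast<const Derived*>(this)->value() * 2; }
};

struct Concrete : ForwardingMixin<Concrete> {
  int value() const { return 21; }
};
// Concrete{}.doubled() == 42
```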
 template <size_t MaxDimensionSize = 100, template <class...> class Container,
           class... TemplateArguments>
 auto make_multidimensional_sparse_tuning_problem(
@@ -362,7 +424,8 @@ auto make_multidimensional_sparse_tuning_problem(
   return MultidimensionalSparseTuningProblem<Container, MaxDimensionSize,
                                              TemplateArguments...>(in, names);
 }
-class TeamSizeTuner {
+
+class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
  private:
   using SpaceDescription = std::map<int64_t, std::vector<int64_t>>;
   using TunerType = decltype(make_multidimensional_sparse_tuning_problem<20>(
@@ -481,7 +544,7 @@ class TeamSizeTuner {
     }
   }
 
- private:
+  TunerType get_tuner() const { return tuner; }
 };
 
 namespace Impl {
@@ -501,7 +564,7 @@ void fill_tile(std::map<T, Mapped>& cont, int tile_size) {
 }  // namespace Impl
 
 template <int MDRangeRank>
-struct MDRangeTuner {
+struct MDRangeTuner : public ExtendableTunerMixin<MDRangeTuner<MDRangeRank>> {
  private:
   static constexpr int rank       = MDRangeRank;
   static constexpr int max_slices = 15;
@@ -548,8 +611,45 @@ struct MDRangeTuner {
       tuner.end();
     }
   }
+
+  TunerType get_tuner() const { return tuner; }
 };
 
+template <class Choice>
+struct CategoricalTuner {
+  using choice_list = std::vector<Choice>;
+  choice_list choices;
+  size_t context;
+  size_t tuning_variable_id;
+  CategoricalTuner(std::string name, choice_list m_choices)
+      : choices(m_choices) {
+    std::vector<int64_t> indices;
+    for (typename decltype(choices)::size_type x = 0; x < choices.size(); ++x) {
+      indices.push_back(x);
+    }
+    VariableInfo info;
+    info.category      = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity = CandidateValueType::kokkos_value_set;
+    info.type          = ValueType::kokkos_value_int64;
+    info.candidates    = make_candidate_set(indices.size(), indices.data());
+    tuning_variable_id = declare_output_type(name, info);
+  }
+  const Choice& begin() {
+    context = get_new_context_id();
+    begin_context(context);
+    VariableValue value = make_variable_value(tuning_variable_id, int64_t(0));
+    request_output_values(context, 1, &value);
+    return choices[value.value.int_value];
+  }
+  void end() { end_context(context); }
+};
+
+template <typename Choice>
+auto make_categorical_tuner(std::string name, std::vector<Choice> choices)
+    -> CategoricalTuner<Choice> {
+  return CategoricalTuner<Choice>(name, choices);
+}
+
 }  // namespace Experimental
 }  // namespace Tools
 }  // namespace Kokkos
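A usage sketch for the categorical tuner added above (hypothetical choice labels; without a tuning tool attached the requested value stays at index 0, so the first choice is returned):

```cpp
#include <Kokkos_Tuners.hpp>
#include <string>
#include <vector>

void tuned_step() {
  static auto tuner = Kokkos::Tools::Experimental::make_categorical_tuner(
      std::string("solver_variant"),
      std::vector<std::string>{"jacobi", "gauss_seidel", "cg"});
  const std::string& choice = tuner.begin();  // tool selects an index
  // ... dispatch on `choice` ...
  tuner.end();  // closes the tuning context
}
```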
diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp
index 1abe0a48df5eab32f01ef703e6d39921eb9c70c3..b217cc4bc171a94ce3fdeea9ce6301c70c106da9 100644
--- a/packages/kokkos/core/src/Kokkos_View.hpp
+++ b/packages/kokkos/core/src/Kokkos_View.hpp
@@ -190,9 +190,9 @@ struct ViewTraits<void, void, Prop...> {
 };
 
 template <class ArrayLayout, class... Prop>
-struct ViewTraits<typename std::enable_if<
-                      Kokkos::Impl::is_array_layout<ArrayLayout>::value>::type,
-                  ArrayLayout, Prop...> {
+struct ViewTraits<
+    typename std::enable_if<Kokkos::is_array_layout<ArrayLayout>::value>::type,
+    ArrayLayout, Prop...> {
   // Specify layout, keep subsequent space and memory traits arguments
 
   using execution_space = typename ViewTraits<void, Prop...>::execution_space;
@@ -204,9 +204,8 @@ struct ViewTraits<typename std::enable_if<
 };
 
 template <class Space, class... Prop>
-struct ViewTraits<
-    typename std::enable_if<Kokkos::Impl::is_space<Space>::value>::type, Space,
-    Prop...> {
+struct ViewTraits<typename std::enable_if<Kokkos::is_space<Space>::value>::type,
+                  Space, Prop...> {
   // Specify Space, memory traits should be the only subsequent argument.
 
   static_assert(
@@ -230,8 +229,8 @@ struct ViewTraits<
 };
 
 template <class MemoryTraits, class... Prop>
-struct ViewTraits<typename std::enable_if<Kokkos::Impl::is_memory_traits<
-                      MemoryTraits>::value>::type,
+struct ViewTraits<typename std::enable_if<
+                      Kokkos::is_memory_traits<MemoryTraits>::value>::type,
                   MemoryTraits, Prop...> {
   // Specify memory trait, should not be any subsequent arguments
 
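These hunks only move the traits from `Kokkos::Impl` into the public `Kokkos` namespace; the dispatch idiom itself is unchanged. A reduced sketch of how the `enable_if` first parameter selects a `ViewTraits`-style partial specialization (toy trait and types):

```cpp
#include <type_traits>

template <class T> struct is_layout : std::false_type {};
struct LayoutLeftLike {};
template <> struct is_layout<LayoutLeftLike> : std::true_type {};

// Primary template: the first parameter exists only to host the enable_if.
template <class Enable, class... Prop>
struct TraitsImpl;

// Selected when the first property satisfies the trait; enable_if yields
// void, matching TraitsImpl<void, Layout, ...>.
template <class Layout, class... Prop>
struct TraitsImpl<typename std::enable_if<is_layout<Layout>::value>::type,
                  Layout, Prop...> {
  using layout = Layout;
};

using Picked = TraitsImpl<void, LayoutLeftLike>::layout;  // LayoutLeftLike
```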
@@ -1543,7 +1542,8 @@ class View : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from using Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::View<...>::View: fence before allocating UVM");
     }
 #endif
     //------------------------------------------------------------
@@ -1555,7 +1555,8 @@ class View : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::View<...>::View: fence after allocating UVM");
     }
 #endif
     //------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
index bdc8993c398f2dd6d6b581008d1f0c8d3535d860..dbb557c13743fa79235ba3786a367b1ab2ac7adc 100644
--- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
@@ -213,7 +213,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_queue.size()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "graph init");
     }
 
     {  // execute-after counts
@@ -221,7 +223,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_graph.entries.size()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "graph count");
     }
 
     {  // Scheduling ready tasks
@@ -229,7 +233,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_graph.numRows()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "readied graph");
     }
   }
 };
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
index e530612a57f81dace23777bdf98670dd73a9d026..0d521479eef89121d30c33a41cebdfac4628646f 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
@@ -447,7 +447,13 @@ OpenMP OpenMP::create_instance(...) { return OpenMP(); }
 
 int OpenMP::concurrency() { return Impl::g_openmp_hardware_max_threads; }
 
-void OpenMP::fence() const {}
+void OpenMP::fence() const {
+  fence("Kokkos::OpenMP::fence: Unnamed Instance Fence");
+}
+void OpenMP::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {});
+}
 
 namespace Impl {
 
@@ -474,6 +480,9 @@ void OpenMPSpaceInitializer::finalize(const bool) {
 }
 
 void OpenMPSpaceInitializer::fence() { Kokkos::OpenMP::impl_static_fence(); }
+void OpenMPSpaceInitializer::fence(const std::string &name) {
+  Kokkos::OpenMP::impl_static_fence(OpenMP(), name);
+}
 
 void OpenMPSpaceInitializer::print_configuration(std::ostream &msg,
                                                  const bool detail) {
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
index 82f049ed136119c28b4add24f1460831fec55b16..1191e49cbe6ecd8d03069288062c88e38e6b8830 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -151,7 +151,14 @@ int OpenMP::impl_thread_pool_rank() noexcept {
 #endif
 }
 
-inline void OpenMP::impl_static_fence(OpenMP const& /*instance*/) noexcept {}
+inline void OpenMP::impl_static_fence(OpenMP const& /*instance*/,
+                                      const std::string& name) noexcept {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {});
+}
 
 inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept {
   return false;
@@ -213,8 +220,9 @@ void OpenMP::partition_master(F const& f, int num_partitions,
 
 namespace Experimental {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 template <>
-class MasterLock<OpenMP> {
+class KOKKOS_DEPRECATED MasterLock<OpenMP> {
  public:
   void lock() { omp_set_lock(&m_lock); }
   void unlock() { omp_unset_lock(&m_lock); }
@@ -231,6 +239,7 @@ class MasterLock<OpenMP> {
  private:
   omp_lock_t m_lock;
 };
+#endif
 
 template <>
 class UniqueToken<OpenMP, UniqueTokenScope::Instance> {
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
index 2a4a7b1d53bd4785f26508fbc990148291bd9763..d9234e34191a39a84b34d9651975b09fa5801a35 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -324,11 +324,6 @@ class TaskQueueSpecializationConstrained<
                 // count of 0 also. Otherwise, returns a task from another queue
                 // or `end` if one couldn't be popped
                 task = team_queue.attempt_to_steal_task();
-#if 0
-                if(task != no_more_tasks_sentinel && task != end) {
-                  std::printf("task stolen on rank %d\n", team_exec.league_rank());
-                }
-#endif
               }
 
              // If tasks are still executing
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
index f13875b440b63b729a64615a20da0f597a85cf6e..7ff885ed86b94e942cce18ff2e7ff1ed664129fa 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -77,9 +77,10 @@ namespace Kokkos {
 namespace Impl {
 
 void OpenMPTargetExec::verify_is_process(const char* const label) {
-  if (omp_in_parallel()) {
+  // Fails if the current task is in a parallel region and is not on the host.
+  if (omp_in_parallel() && (!omp_is_initial_device())) {
     std::string msg(label);
-    msg.append(" ERROR: in parallel");
+    msg.append(" ERROR: in parallel or on device");
     Kokkos::Impl::throw_runtime_exception(msg);
   }
 }
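The check now combines two OpenMP runtime queries; a standalone sketch of what each reports:

```cpp
#include <omp.h>
#include <cstdio>

// omp_in_parallel() is nonzero inside an active parallel region;
// omp_is_initial_device() is nonzero when executing on the host.
void report_context() {
  std::printf("in parallel: %d, on host: %d\n", omp_in_parallel(),
              omp_is_initial_device());
}
```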
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
index 0b65e0d4a4b2270fdf577b4fffc1a10835467a47..ccfc756213695df6479634a67405d7d89308dbb8 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
@@ -54,7 +54,10 @@
 // FIXME_OPENMPTARGET - Using this macro to implement a workaround for
 // hierarchical reducers. It avoids hitting the code path which we wanted to
 // write but doesn't work. undef'ed at the end.
+// Intel compilers prefer the non-workaround version.
+#ifndef KOKKOS_ARCH_INTEL_GPU
 #define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -66,10 +69,6 @@ template <class Reducer>
 struct OpenMPTargetReducerWrapper {
   using value_type = typename Reducer::value_type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type&, const value_type&) {
     printf(
@@ -90,7 +89,6 @@ struct OpenMPTargetReducerWrapper {
         "Using a generic unknown Reducer for the OpenMPTarget backend is not "
         "implemented.");
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -99,10 +97,6 @@ struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) { dest += src; }
@@ -116,7 +110,6 @@ struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::sum();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -125,10 +118,6 @@ struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) { dest *= src; }
@@ -142,7 +131,6 @@ struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::prod();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -151,10 +139,6 @@ struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -170,7 +154,6 @@ struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -179,10 +162,6 @@ struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -199,7 +178,6 @@ struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::max();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -208,10 +186,6 @@ struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
     dest = dest && src;
@@ -226,7 +200,6 @@ struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::land();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -237,10 +210,6 @@ struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -256,7 +225,6 @@ struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::lor();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -265,10 +233,6 @@ struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -284,7 +248,6 @@ struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::band();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -293,10 +256,6 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -312,7 +271,6 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::bor();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -325,10 +283,6 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
   // Required
   using value_type = ValLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -345,7 +299,6 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
     val.val = reduction_identity<scalar_type>::min();
     val.loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -358,10 +311,6 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
   // Required
   using value_type = ValLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
     if (src.val > dest.val) dest = src;
@@ -377,7 +326,6 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
     val.val = reduction_identity<scalar_type>::max();
     val.loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -389,10 +337,6 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
   // Required
   using value_type = MinMaxScalar<scalar_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -419,7 +363,6 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
     val.max_val = reduction_identity<scalar_type>::max();
     val.min_val = reduction_identity<scalar_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -432,10 +375,6 @@ struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
   // Required
   using value_type = MinMaxLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -468,7 +407,6 @@ struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
     val.max_loc = reduction_identity<index_type>::min();
     val.min_loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 /*
 template<class ReducerType>
@@ -560,47 +498,20 @@ class OpenMPTargetExecTeamMember {
   void* m_glb_scratch;
   void* m_reduce_scratch;
 
-  /*
-  // Fan-in team threads, root of the fan-in which does not block returns true
-  inline
-  bool team_fan_in() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
-  m_team_rank_rev & n ) ; n <<= 1 ) {
-
-        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
-      }
-
-      if ( m_team_rank_rev ) {
-        m_exec.state_set( Rendezvous );
-        memory_fence();
-        m_exec.state_wait( Rendezvous );
-      }
-
-      return 0 == m_team_rank_rev ;
-    }
-
-  inline
-  void team_fan_out() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
-  m_team_rank_rev & n ) ; n <<= 1 ) { m_exec.pool_rev( m_team_base_rev + j
-  )->state_set( Active ); memory_fence();
-      }
-    }
-  */
  public:
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_shmem() const {
     return m_team_shared.set_team_thread_mode(0, 1, 0);
   }
 
+  // set_team_thread_mode routine parameters, for future reference:
+  //   first parameter  - scratch level.
+  //   second parameter - size multiplier for advancing the scratch pointer
+  //                      after a request was serviced.
+  //   third parameter  - offset size multiplier from the current scratch
+  //                      pointer when returning a pointer for a request.
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_scratch(int level) const {
-    return m_team_shared.set_team_thread_mode(level, 1,
-                                              m_team_scratch_size[level]);
+    return m_team_shared.set_team_thread_mode(level, 1, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -627,8 +538,9 @@ class OpenMPTargetExecTeamMember {
     using type =
         typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
                                   ValueType, void>::type;
-    type* team_scratch = reinterpret_cast<type*>(
-        ((char*)(m_glb_scratch) + TEAM_REDUCE_SIZE * omp_get_team_num()));
+    type* team_scratch =
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
 #pragma omp barrier
     if (team_rank() == thread_id) *team_scratch = value;
 #pragma omp barrier
@@ -656,7 +568,8 @@ class OpenMPTargetExecTeamMember {
 
     const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
     type* team_scratch =
-        (type*)((char*)m_glb_scratch + TEAM_REDUCE_SIZE * omp_get_team_num());
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
     for (int i = m_team_rank; i < n_values; i += m_team_size) {
       team_scratch[i] = value_type();
     }
@@ -770,27 +683,24 @@ class OpenMPTargetExecTeamMember {
         m_shmem_block_index(shmem_block_index),
         m_glb_scratch(glb_scratch) {
     const int omp_tid = omp_get_thread_num();
-    m_team_shared     = scratch_memory_space(
-        ((char*)glb_scratch +
-         m_shmem_block_index *
-             (shmem_size_L0 + shmem_size_L1 +
-              ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE)),
-        shmem_size_L0,
-        ((char*)glb_scratch +
-         m_shmem_block_index * (shmem_size_L0 + shmem_size_L1 +
-                                ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
-                                TEAM_REDUCE_SIZE)) +
-            shmem_size_L0 + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
-            TEAM_REDUCE_SIZE,
-        shmem_size_L1);
-    m_reduce_scratch =
-        (char*)glb_scratch +
-        shmem_block_index *
-            (shmem_size_L0 + shmem_size_L1 +
-             ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE);
-    m_league_rank = league_rank;
-    m_team_rank   = omp_tid;
-    m_vector_lane = 0;
+
+    // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size
+    // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
+    // hierarchical reduction. There is an additional 10% of the requested
+    // scratch memory allocated per team as padding. Hence the product with 0.1.
+    const int reduce_offset =
+        m_shmem_block_index *
+        (shmem_size_L0 + shmem_size_L1 +
+         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
+    const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
+    const int l1_offset = l0_offset + shmem_size_L0;
+    m_team_shared       = scratch_memory_space(
+        (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
+        static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
+    m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
+    m_league_rank    = league_rank;
+    m_team_rank      = omp_tid;
+    m_vector_lane    = 0;
   }
 
   static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
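The offsets above lay each team's block out as [reduce scratch | L0 shmem | L1 shmem], with roughly 10% padding added for the shmem request. A standalone sketch of the arithmetic (integer approximation of the `* 0.1` padding; the constant is illustrative):

```cpp
// Per-team scratch layout: TEAM_REDUCE bytes of reduction scratch first,
// then L0, then L1; each block is padded by ~10% of the shmem request.
constexpr int TEAM_REDUCE = 512;

int reduce_offset(int block_index, int l0, int l1) {
  const int block_size = l0 + l1 + (l0 + l1) / 10 + TEAM_REDUCE;
  return block_index * block_size;
}
int l0_offset(int block_index, int l0, int l1) {
  return reduce_offset(block_index, l0, l1) + TEAM_REDUCE;
}
int l1_offset(int block_index, int l0, int l1) {
  return l0_offset(block_index, l0, l1) + l0;
}
```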
@@ -877,6 +787,16 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
   friend class TeamPolicyInternal;
 
  public:
+  // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda
+  // implementation, but this has to be tailored to be architecture specific.
+  inline static int scratch_size_max(int level) {
+    return (
+        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
+                                  // for team_member.reduce etc.
+            20 * 1024 *
+                1024);  // arbitrarily setting this to 20MB, for a Volta V100
+                        // that would give us about 3.2GB for 2 teams per SM
+  }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline void impl_set_team_size(const size_t size) { m_team_size = size; }
@@ -884,9 +804,11 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
     m_tune_vector_length = length;
   }
   inline int impl_vector_length() const { return m_vector_length; }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED inline int vector_length() const {
     return impl_vector_length();
   }
+#endif
   inline int team_size() const { return m_team_size; }
   inline int league_size() const { return m_league_size; }
   inline size_t scratch_size(const int& level, int team_size_ = -1) const {
@@ -1245,21 +1167,12 @@ KOKKOS_INLINE_FUNCTION
       static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
 
 #pragma omp barrier
-  // These three lines all cause crash
   Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
-//  result.init(TeamThread_scratch[0]);
-//  Impl::OpenMPTargetReducerWrapper<ReducerType> red;
-//  red.init(TeamThread_scratch[0]);
 #pragma omp barrier
 
 #pragma omp for reduction(custominner : TeamThread_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp;
-    result.init(tmp);
-    lambda(i, tmp);
-    // This line causes a crash
-    Impl::OpenMPTargetReducerWrapper<ReducerType>::join(TeamThread_scratch[0],
-                                                        tmp);
+    lambda(i, TeamThread_scratch[0]);
   }
   result.reference() = TeamThread_scratch[0];
 }
@@ -1305,6 +1218,12 @@ KOKKOS_INLINE_FUNCTION
          i += team_size) {
       lambda(i, tmp2);
     }
+
+    // FIXME_OPENMPTARGET: Join should work but doesn't. Every thread gets a
+    // private TeamThread_scratch[0], and at the end of the for-loop the `join`
+    // operation is performed by OpenMP itself, so the simple assignment
+    // works.
+    //    result.join(TeamThread_scratch[0], tmp2);
     TeamThread_scratch[0] = tmp2;
   }
 
@@ -1336,28 +1255,31 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
   static_assert(sizeof(ValueType) <=
                 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
 
+  // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here.
+  const int value_count = 1;
+
 #pragma omp barrier
   TeamThread_scratch[0] = init_result;
 #pragma omp barrier
 
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp for reduction(+ : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamThread_scratch[0] += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp for reduction(custom : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      join(TeamThread_scratch[0], tmp);
+#pragma omp for
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamThread_scratch[omp_get_thread_num() * value_count]);
+  }
+
+  // Reduce all partial results within a team.
+  const int team_size      = omp_get_num_threads();
+  int tree_neighbor_offset = 1;
+  do {
+#pragma omp for
+    for (int i = 0; i < team_size - tree_neighbor_offset;
+         i += 2 * tree_neighbor_offset) {
+      const int neighbor = i + tree_neighbor_offset;
+      join(lambda, &TeamThread_scratch[i * value_count],
+           &TeamThread_scratch[neighbor * value_count]);
     }
-  }
-
+    tree_neighbor_offset *= 2;
+  } while (tree_neighbor_offset < team_size);
   init_result = TeamThread_scratch[0];
 }
 
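The replacement reduction gives each thread its own scratch slot, then combines slots pairwise at doubling strides until slot 0 holds the team total. The same shape on plain integers:

```cpp
#include <cstdio>

// Tree reduction over n partial results: after log2(n) passes the sum
// sits in slots[0].
int tree_reduce_sum(int* slots, int n) {
  for (int stride = 1; stride < n; stride *= 2)
    for (int i = 0; i + stride < n; i += 2 * stride)
      slots[i] += slots[i + stride];
  return slots[0];
}

int main() {
  int slots[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  std::printf("%d\n", tree_reduce_sum(slots, 8));  // prints 36
}
```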
@@ -1402,7 +1324,6 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 }
 
 }  // namespace Kokkos
-#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 
 namespace Kokkos {
 /** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
@@ -1530,8 +1451,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
 #endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
     lambda(i, scan_val, true);
   }
 }
@@ -1629,9 +1549,7 @@ KOKKOS_INLINE_FUNCTION
 
 #pragma omp for simd reduction(custom : TeamVector_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp = ValueType();
-    lambda(i, tmp);
-    TeamVector_scratch[0] += tmp;
+    lambda(i, TeamVector_scratch[0]);
   }
 
   result.reference() = TeamVector_scratch[0];
@@ -1686,7 +1604,9 @@ KOKKOS_INLINE_FUNCTION
 #endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 #undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
 
 namespace Kokkos {
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index 4a79b72732dafb9bd93613723551ec7a9b01ddd1..e421edc5b4c108b03fb8680863184828df243dd7 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -59,7 +59,34 @@
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
-void OpenMPTargetInternal::fence() {}
+uint32_t OpenMPTargetInternal::impl_get_instance_id() const noexcept {
+  return m_instance_id;
+}
+
+void OpenMPTargetInternal::fence(openmp_fence_is_static is_static) {
+  fence(
+      "Kokkos::Experimental::Impl::OpenMPTargetInternal::fence: Unnamed "
+      "Internal Fence",
+      is_static);
+}
+void OpenMPTargetInternal::fence(const std::string& name,
+                                 openmp_fence_is_static is_static) {
+  if (is_static == openmp_fence_is_static::no) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<
+        Kokkos::Experimental::OpenMPTarget>(
+        name,
+        Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+            impl_get_instance_id()},
+        [&]() {});
+  } else {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<
+        Kokkos::Experimental::OpenMPTarget>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        [&]() {});
+  }
+}
 int OpenMPTargetInternal::concurrency() { return 128000; }
 const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; }
 void OpenMPTargetInternal::print_configuration(std::ostream& /*stream*/,
@@ -77,7 +104,18 @@ void OpenMPTargetInternal::impl_finalize() {
     Kokkos::kokkos_free<Kokkos::Experimental::OpenMPTargetSpace>(
         space.m_uniquetoken_ptr);
 }
-void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; }
+void OpenMPTargetInternal::impl_initialize() {
+  m_is_initialized = true;
+
+  // FIXME_OPENMPTARGET:  Only fix the number of teams for NVIDIA architectures
+  // from Pascal and upwards.
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
+  omp_set_num_teams(512);
+#endif
+#endif
+}
 int OpenMPTargetInternal::impl_is_initialized() {
   return m_is_initialized ? 1 : 0;
 }
@@ -100,11 +138,28 @@ void OpenMPTarget::print_configuration(std::ostream& stream,
   m_space_instance->print_configuration(stream, detail);
 }
 
+uint32_t OpenMPTarget::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
+
 int OpenMPTarget::concurrency() {
   return Impl::OpenMPTargetInternal::impl_singleton()->concurrency();
 }
 void OpenMPTarget::fence() {
-  Impl::OpenMPTargetInternal::impl_singleton()->fence();
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence");
+}
+void OpenMPTarget::fence(const std::string& name) {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(name);
+}
+void OpenMPTarget::impl_static_fence() {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence",
+      Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
+}
+void OpenMPTarget::impl_static_fence(const std::string& name) {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
 }
 
 void OpenMPTarget::impl_initialize() { m_space_instance->impl_initialize(); }
@@ -146,7 +201,10 @@ void OpenMPTargetSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void OpenMPTargetSpaceInitializer::fence() {
-  Kokkos::Experimental::OpenMPTarget::fence();
+  Kokkos::Experimental::OpenMPTarget::impl_static_fence();
+}
+void OpenMPTargetSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::OpenMPTarget::impl_static_fence(name);
 }
 
 void OpenMPTargetSpaceInitializer::print_configuration(std::ostream& msg,
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
index a1caf90c195b98511b4476db73345816af2b4669..b495771190e35c661df26a5ab3bc1d53a544cff7 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
@@ -51,6 +51,8 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+enum class openmp_fence_is_static { yes, no };
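+// openmp_fence_is_static distinguishes a device-wide (static) fence from a
+// per-instance fence; the distinction only affects how the fence is reported
+// to Kokkos Tools.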
+
 class OpenMPTargetInternal {
  private:
   OpenMPTargetInternal()                            = default;
@@ -58,7 +60,9 @@ class OpenMPTargetInternal {
   OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default;
 
  public:
-  void fence();
+  void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no);
+  void fence(const std::string& name,
+             openmp_fence_is_static is_static = openmp_fence_is_static::no);
 
   /** \brief  Return the maximum amount of concurrency.  */
   int concurrency();
@@ -73,14 +77,16 @@ class OpenMPTargetInternal {
 
   //! Has been initialized
   int impl_is_initialized();
-
+  uint32_t impl_get_instance_id() const noexcept;
   //! Initialize, telling the CUDA run-time library which device to use.
   void impl_initialize();
 
   static OpenMPTargetInternal* impl_singleton();
 
  private:
-  bool m_is_initialized = false;
+  bool m_is_initialized  = false;
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::OpenMPTarget>(reinterpret_cast<uintptr_t>(this));
 };
 }  // Namespace Impl
 }  // Namespace Experimental
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index a4092c3a37a7e9a1493576c5efe783334982a391..08a3109408bd88bcfa04f3fe31d09f5a6e1cff2c 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -51,8 +51,6 @@
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#define KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
-
 namespace Kokkos {
 namespace Impl {
 
@@ -69,24 +67,10 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
  public:
-  inline void execute() const { execute_impl<WorkTag>(); }
-  /*
-    template <class TagType>
-    inline typename std::enable_if<std::is_same<TagType, void>::value>::type
-    execute_impl() const {
-      OpenMPTargetExec::verify_is_process(
-          "Kokkos::Experimental::OpenMPTarget parallel_for");
-      OpenMPTargetExec::verify_initialized(
-          "Kokkos::Experimental::OpenMPTarget parallel_for");
-      const typename Policy::member_type begin = m_policy.begin();
-      const typename Policy::member_type end   = m_policy.end();
-
-  #pragma omp target teams distribute parallel for map(to: this->m_functor)
-      for (int i = begin; i < end; i++) m_functor(i);
-    }
-  */
+  void execute() const { execute_impl<WorkTag>(); }
+
   template <class TagType>
-  inline void execute_impl() const {
+  void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -98,16 +82,17 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     FunctorType a_functor(m_functor);
 
-    if constexpr (std::is_same<TagType, void>::value) {
 #pragma omp target teams distribute parallel for map(to : a_functor)
-      for (auto i = begin; i < end; i++) a_functor(i);
-    } else {
-#pragma omp target teams distribute parallel for map(to : a_functor)
-      for (auto i = begin; i < end; i++) a_functor(TagType(), i);
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_same<TagType, void>::value) {
+        a_functor(i);
+      } else {
+        a_functor(TagType(), i);
+      }
     }
   }
 
-  inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
 };
 
@@ -120,12 +105,31 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 namespace Kokkos {
 namespace Impl {
 
+// This struct provides the memcpy routine shared by the ParallelReduce
+// specializations over RangePolicy and TeamPolicy.
+template <class PointerType>
+struct ParallelReduceCommon {
+  // Copy the result back to device if the view is on the device.
+  static void memcpy_result(PointerType dest, PointerType src, size_t size,
+                            bool ptr_on_device) {
+    if (ptr_on_device) {
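+      // omp_target_memcpy(dst, src, bytes, dst_offset, src_offset,
+      // dst_device, src_device): here a copy from the host (initial device)
+      // to the default target device.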
+      OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0,
+                                       omp_get_default_device(),
+                                       omp_get_initial_device()));
+    } else {
+      *dest = *src;
+    }
+  }
+};
+
 template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType, bool FunctorHasJoin,
-          bool UseReducerType>
+          class PointerType, class ValueType>
 struct ParallelReduceSpecialize {
-  static inline void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
+  inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
                              PointerType /*result_ptr*/) {
+    constexpr int FunctorHasJoin = ReduceFunctorHasJoin<FunctorType>::value;
+    constexpr int UseReducerType = is_reducer_type<ReducerType>::value;
+
     std::stringstream error_message;
     error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' '
                   << UseReducerType << '\n';
@@ -137,12 +141,26 @@ struct ParallelReduceSpecialize {
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                false> {
+                                ReducerType, PointerType, ValueType> {
   using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
+                         void>;
+
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
+  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
+  using ReferenceType = typename ValueTraits::reference_type;
+
+  using ParReduceCommon = ParallelReduceCommon<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -153,69 +171,220 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     if (end <= begin) return;
 
     ValueType result = ValueType();
-    if constexpr (std::is_same<TagType, void>::value) {
-#pragma omp target teams distribute parallel for num_teams(512) \
-                map(to:f) map(tofrom:result) reduction(+: result)
-      for (auto i = begin; i < end; i++) f(i, result);
-    } else {
-#pragma omp target teams distribute parallel for num_teams(512) \
-                map(to:f) map(tofrom:result) reduction(+: result)
-      for (auto i = begin; i < end; i++) f(TagType(), i, result);
-    }
-
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
-  }
-};
 
-template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType>
-struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType,
-                                PointerType, ValueType, false, true> {
 #pragma omp declare reduction(                                         \
     custom:ValueType                                                   \
     : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
     initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
 
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+    OpenMPTargetReducerWrapper<ReducerType>::init(result);
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_same<TagType, void>::value) {
+        f(i, result);
+      } else {
+        f(TagType(), i, result);
+      }
+    }
+
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+  }
+
+  template <class TagType, int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
+    const auto begin = p.begin();
+    const auto end   = p.end();
 
     if (end <= begin) return;
 
     ValueType result = ValueType();
-    OpenMPTargetReducerWrapper<ReducerType>::init(result);
 
-    if constexpr (std::is_same<TagType, void>::value) {
-#pragma omp target teams distribute parallel for num_teams(512) map(to   \
-                                                                    : f) \
-    reduction(custom                                                     \
-              : result)
-      for (auto i = begin; i < end; i++) f(i, result);
-      *result_ptr = result;
+    // Enter this branch if the reduction is over a single scalar value.
+    if constexpr (NumReductions == 1) {
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams distribute parallel for \
+         map(to:f) reduction(+: result)
+        for (auto i = begin; i < end; ++i) {
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+        }
+      } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+        for (auto i = begin; i < end; ++i) {
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+        }
+      }
     } else {
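+      // Array case: OpenMP reduces each element of the array section
+      // result[:NumReductions] element-wise across threads.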
-#pragma omp target teams distribute parallel for num_teams(512) map(to   \
-                                                                    : f) \
-    reduction(custom                                                     \
-              : result)
-      for (auto i = begin; i < end; i++) f(TagType(), i, result);
-      *result_ptr = result;
+#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions])
+      for (auto i = begin; i < end; ++i) {
+        if constexpr (std::is_same<TagType, void>::value) {
+          f(i, result);
+        } else {
+          f(TagType(), i, result);
+        }
+      }
     }
+
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
   }
 
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    const auto begin = p.begin();
+    const auto end   = p.end();
+
+    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+
+    const auto size = end - begin;
+
+    // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently
+    // based on the NVIDIA V100 and should be modified to depend on the
+    // architecture in the future.
+    const int max_team_threads = 32;
+    const int max_teams =
+        OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads;
+    // Number of elements in the reduction
+    const auto value_count =
+        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+
+    // Allocate scratch per active thread by passing 1 as the first argument
+    // of `resize_scratch`.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    ValueType* scratch_ptr =
+        static_cast<ValueType*>(OpenMPTargetExec::get_scratch_ptr());
+
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+    {
+      // Enter this branch if the functor provides an `init` member.
+      if constexpr (HasInit) {
+        // The `init` routine needs to be called on the device since it might
+        // need device members.
+        ValueInit::init(f, scratch_ptr);
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      } else {
+        for (int i = 0; i < value_count; ++i) {
+          scratch_ptr[i] = ValueType();
+        }
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    }
+
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
+      if (!ptr_on_device)
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \
+    map(to                                                                   \
+        : f) is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num    = omp_get_team_num();
+        const int num_teams   = omp_get_num_teams();
+        const auto chunk_size = size / num_teams;
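+        // Each team takes a contiguous chunk of the range; the last team
+        // also picks up the remainder.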
+        const auto team_begin = begin + team_num * chunk_size;
+        const auto team_end =
+            (team_num == num_teams - 1) ? end : (team_begin + chunk_size);
+        ValueType* team_scratch =
+            scratch_ptr + team_num * max_team_threads * value_count;
+        ReferenceType result = ValueInit::init(
+            f, &team_scratch[omp_get_thread_num() * value_count]);
+
+        // Accumulate partial results in thread specific storage.
+#pragma omp for simd
+        for (auto i = team_begin; i < team_end; ++i) {
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+        }
+
+        // Reduce all partial results within a team.
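+        // Pairwise tree reduction: on pass k, slots i and i + 2^k are
+        // joined. E.g. for team_size = 8, offsets 1, 2, 4 combine (0,1),
+        // (2,3), (4,5), (6,7), then (0,2), (4,6), then (0,4), leaving the
+        // team's result in slot 0.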
+        const int team_size      = max_team_threads;
+        int tree_neighbor_offset = 1;
+        do {
+#pragma omp for simd
+          for (int i = 0; i < team_size - tree_neighbor_offset;
+               i += 2 * tree_neighbor_offset) {
+            const int neighbor = i + tree_neighbor_offset;
+            ValueJoin::join(f, &team_scratch[i * value_count],
+                            &team_scratch[neighbor * value_count]);
+          }
+          tree_neighbor_offset *= 2;
+        } while (tree_neighbor_offset < team_size);
+      }  // end parallel
+    }    // end target
+
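+    // Second stage: the same pairwise tree, now across the per-team slots
+    // (stride max_team_threads * value_count), leaving the final result at
+    // the front of the scratch buffer.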
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < max_teams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = scratch_ptr;
+        const int team_offset   = max_team_threads * value_count;
+        ValueJoin::join(
+            f, &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
+          // Do the final only once at the end.
+          if (tree_neighbor_offset * 2 >= max_teams &&
+              omp_get_team_num() == 0 && omp_get_thread_num() == 0)
+            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < max_teams);
+
+    // Copy the final result out of scratch: device-to-host if the result
+    // view is on the host, device-to-device otherwise.
+    if (!ptr_on_device)
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_initial_device(), omp_get_default_device()));
+    else
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_default_device(), omp_get_default_device()));
   }
 };
 
@@ -227,47 +396,77 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   using WorkTag   = typename Policy::work_tag;
   using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
   using WorkTagFwd =
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
 
-  // Static Assert WorkTag void if ReducerType not InvalidType
-
   using ValueTraits =
       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
 
   using pointer_type   = typename ValueTraits::pointer_type;
   using reference_type = typename ValueTraits::reference_type;
 
+  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
+  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
+
   using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type, HasJoin,
-                               UseReducer>;
+                               typename ValueTraits::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+  using TagType = typename Policy::work_tag;
 
  public:
-  inline void execute() const {
-    ParReduceSpecialize::execute(m_functor, m_policy, m_result_ptr);
+  void execute() const {
+    if constexpr (HasJoin) {
+      // Enter this branch if the functor provides init/join members.
+      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      // Enter this branch if the reduction uses a Kokkos reducer.
+      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
+      // Enter this branch if the reduction is over an array; the routine is
+      // templated on the array size.
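+      // The array length is only known at runtime, so round it up to the
+      // nearest instantiated size (2, 4, 8, 16, 32) to keep the number of
+      // template instantiations bounded.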
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<TagType, 2>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<TagType, 4>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<TagType, 8>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<TagType, 16>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<TagType, 32>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      // This branch handles the basic scalar reduction.
+      ParReduceSpecialize::template execute_array<TagType, 1>(
+          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
   }
 
   template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType& arg_functor, Policy arg_policy,
+  ParallelReduce(
+      const FunctorType& arg_functor, Policy& arg_policy,
       const ViewType& arg_result_view,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
@@ -275,14 +474,23 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {}
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
+        m_result_ptr(arg_result_view.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result_view.size()) {}
+
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ReducerType& reducer)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_num_elems(reducer.view().size()) {}
 };
 
 }  // namespace Impl
@@ -318,20 +526,20 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
   template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
+  typename std::enable_if<std::is_same<TagType, void>::value>::type
   call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                 const bool& is_final) const {
     f(idx, val, is_final);
   }
   template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
+  typename std::enable_if<!std::is_same<TagType, void>::value>::type
   call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                 const bool& is_final) const {
     f(WorkTag(), idx, val, is_final);
   }
 
  public:
-  inline void impl_execute(
+  void impl_execute(
       Kokkos::View<value_type**, Kokkos::LayoutRight,
                    Kokkos::Experimental::OpenMPTargetSpace>
           element_values,
@@ -349,13 +557,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp target teams distribute map(to                             \
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
 
 #pragma omp for
-        for (idx_type i = 0; i < chunk_size; i++) {
+        for (idx_type i = 0; i < chunk_size; ++i) {
           const idx_type idx = local_offset + i;
           value_type val;
           ValueInit::init(a_functor, &val);
@@ -366,7 +574,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         if (omp_get_thread_num() == 0) {
           value_type sum;
           ValueInit::init(a_functor, &sum);
-          for (idx_type i = 0; i < chunk_size; i++) {
+          for (idx_type i = 0; i < chunk_size; ++i) {
             ValueJoin::join(a_functor, &sum, &element_values(team_id, i));
             element_values(team_id, i) = sum;
           }
@@ -377,7 +585,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
           if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
             value_type sum;
             ValueInit::init(a_functor, &sum);
-            for (idx_type i = 0; i < n_chunks; i++) {
+            for (idx_type i = 0; i < n_chunks; ++i) {
               ValueJoin::join(a_functor, &sum, &chunk_values(i));
               chunk_values(i) = sum;
             }
@@ -389,7 +597,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp target teams distribute map(to                             \
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
@@ -400,7 +608,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
           ValueInit::init(a_functor, &offset_value);
 
 #pragma omp for
-        for (idx_type i = 0; i < chunk_size; i++) {
+        for (idx_type i = 0; i < chunk_size; ++i) {
           const idx_type idx = local_offset + i;
           value_type local_offset_value;
           if (i > 0) {
@@ -415,7 +623,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     }
   }
 
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -438,7 +646,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 
   //----------------------------------------
 
-  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
+  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
 
   //----------------------------------------
@@ -455,7 +663,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   value_type& m_returnvalue;
 
  public:
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -513,7 +721,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const int m_shmem_size;
 
  public:
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -523,7 +731,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
  private:
   template <class TagType>
-  inline void execute_impl() const {
+  void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -549,7 +757,6 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
-#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
 // Performing our own scheduling of teams to avoid separation of code between
 // teams-distribute and parallel. Gave a 2x performance boost in test cases with
 // the clang compiler. atomic_compare_exchange can be avoided since the standard
@@ -580,49 +787,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
       } else
         Kokkos::abort("`num_teams` clause was not respected.\n");
     }
-
-#else
-// Saving the older implementation that uses `atomic_compare_exchange` to
-// calculate the shared memory block index and `distribute` clause to distribute
-// teams.
-#pragma omp target teams distribute map(to                   \
-                                        : a_functor)         \
-    is_device_ptr(scratch_ptr, lock_array) num_teams(nteams) \
-        thread_limit(team_size)
-    for (int i = 0; i < league_size; i++) {
-      int shmem_block_index = -1, lock_team = 99999, iter = -1;
-      iter = (omp_get_team_num() % max_active_teams);
-
-      // Loop as long as a shmem_block_index is not found.
-      while (shmem_block_index == -1) {
-        // Try and acquire a lock on the index.
-        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
-
-        // If lock is acquired assign it to the block index.
-        // lock_team = 0, implies atomic_compare_exchange is successfull.
-        if (lock_team == 0)
-          shmem_block_index = iter;
-        else
-          iter = ++iter % max_active_teams;
-      }
-
-#pragma omp parallel num_threads(team_size)
-      {
-        typename Policy::member_type team(
-            i, league_size, team_size, vector_length, scratch_ptr,
-            shmem_block_index, shmem_size_L0, shmem_size_L1);
-        m_functor(team);
-      }
-
-      // Free the locked block and increment the number of available free
-      // blocks.
-      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
-    }
-#endif
   }
 
  public:
-  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
@@ -633,13 +801,26 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                false> {
+                                ReducerType, PointerType, ValueType> {
   using PolicyType = TeamPolicyInternal<PolicyArgs...>;
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
+                         void>;
 
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
+  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
+  using ReferenceType = typename ValueTraits::reference_type;
+
+  using ParReduceCommon = ParallelReduceCommon<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -662,11 +843,16 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
-#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
 #pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
                                                                        : f) \
-    is_device_ptr(scratch_ptr) reduction(+: result)
-#pragma omp parallel reduction(+ : result)
+    is_device_ptr(scratch_ptr) reduction(custom                             \
+                                         : result)
+#pragma omp parallel reduction(custom : result)
     {
       const int blockIdx = omp_get_team_num();
       const int gridDim  = omp_get_num_teams();
@@ -687,79 +873,27 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
         Kokkos::abort("`num_teams` clause was not respected.\n");
     }
 
-    *result_ptr = result;
-#else
-// Saving the older implementation that uses `atomic_compare_exchange` to
-// calculate the shared memory block index and `distribute` clause to distribute
-// teams.
-#pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \
-         map(to:f) map(tofrom:result) reduction(+: result) \
-    is_device_ptr(scratch_ptr, lock_array)
-    for (int i = 0; i < league_size; i++) {
-      ValueType inner_result = ValueType();
-      int shmem_block_index = -1, lock_team = 99999, iter = -1;
-      iter = (omp_get_team_num() % max_active_teams);
-
-      // Loop as long as a shmem_block_index is not found.
-      while (shmem_block_index == -1) {
-        // Try and acquire a lock on the index.
-        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
-
-        // If lock is acquired assign it to the block index.
-        // lock_team = 0, implies atomic_compare_exchange is successfull.
-        if (lock_team == 0)
-          shmem_block_index = iter;
-        else
-          iter = ++iter % max_active_teams;
-      }
-#pragma omp parallel num_threads(team_size) reduction(+ : inner_result)
-      {
-        typename PolicyType::member_type team(
-            i, league_size, team_size, vector_length, scratch_ptr,
-            shmem_block_index, shmem_size_L0, shmem_size_L1);
-        f(team, inner_result);
-      }
-      result = inner_result;
-
-      // Free the locked block and increment the number of available free
-      // blocks.
-      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
-    }
-
-    *result_ptr = result;
-#endif
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+    // Copy the result back; memcpy_result handles both host- and
+    // device-resident result views.
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
   }
-};
 
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                true> {
-  using PolicyType = TeamPolicyInternal<PolicyArgs...>;
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  template <int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
 
-#pragma omp declare reduction(                                         \
-    custom:ValueType                                                   \
-    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-    const int league_size      = p.league_size();
-    const int team_size        = p.team_size();
-    const int vector_length    = p.impl_vector_length();
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
     const size_t shmem_size_L0 = p.scratch_size(0, team_size);
     const size_t shmem_size_L1 = p.scratch_size(1, team_size);
-    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);
+    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
+                                     shmem_size_L0, shmem_size_L1);
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
 
     ValueType result = ValueType();
@@ -769,37 +903,229 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
+    // Case where the number of reduction items is 1.
+    if constexpr (NumReductions == 1) {
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+: result)
+#pragma omp parallel reduction(+ : result)
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Make sure the compiler respected the `num_teams` clause.
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length, scratch_ptr,
+                  blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_same<TagType, void>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      } else {
+        // Case where the reduction is on a non-native data type.
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
 #pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
                                                                        : f) \
     is_device_ptr(scratch_ptr) reduction(custom                             \
                                          : result)
 #pragma omp parallel reduction(custom : result)
-    {
-      const int blockIdx = omp_get_team_num();
-      const int gridDim  = omp_get_num_teams();
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Make sure the compiler respected the `num_teams` clause.
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length, scratch_ptr,
+                  blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_same<TagType, void>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      }
+    } else {
+      // Case where the reduction is on an array.
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions])
+#pragma omp parallel reduction(+ : result[:NumReductions])
+      {
+        const int blockIdx = omp_get_team_num();
+        const int gridDim  = omp_get_num_teams();
+
+        // Make sure the compiler respected the `num_teams` clause.
+        if (gridDim <= nteams) {
+          for (int league_id = blockIdx; league_id < league_size;
+               league_id += gridDim) {
+            typename PolicyType::member_type team(
+                league_id, league_size, team_size, vector_length, scratch_ptr,
+                blockIdx, shmem_size_L0, shmem_size_L1);
+            if constexpr (std::is_same<TagType, void>::value)
+              f(team, result);
+            else
+              f(TagType(), team, result);
+          }
+        } else
+          Kokkos::abort("`num_teams` clause was not respected.\n");
+      }
+    }
 
-      // Guarantee that the compilers respect the `num_teams` clause
-      if (gridDim <= nteams) {
-        for (int league_id = blockIdx; league_id < league_size;
-             league_id += gridDim) {
+    // Copy the result back; memcpy_result handles both host- and
+    // device-resident result views.
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+  }
+
+  // FIXME_OPENMPTARGET: This routine is copied from the RangePolicy
+  // `parallel_reduce` and needs its own implementation.
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    const auto begin      = p.begin();
+    const auto end        = p.end();
+    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+
+    const auto size = end - begin;
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+
+    // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are
+    // already using the available scratch memory to create temporaries for each
+    // thread.
+    if ((shmem_size_L0 + shmem_size_L1) > 0) {
+      Kokkos::abort(
+          "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` "
+          "over functors with init/join.");
+    }
+
+    // Maximum active teams possible.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
+
+    // Number of elements in the reduction
+    const auto value_count =
+        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+
+    // Allocate scratch per active thread.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    // Enter this branch if the functor provides an `init` member.
+    if constexpr (HasInit) {
+      // The `init` routine needs to be called on the device since it might need
+      // device members.
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        ValueInit::init(f, scratch_ptr);
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    } else {
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        for (int i = 0; i < value_count; ++i) {
+          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
+        }
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    }
+
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
+      if (!ptr_on_device)
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num      = omp_get_team_num();
+        const int num_teams     = omp_get_num_teams();
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr) +
+                                  team_num * team_size * value_count;
+        ReferenceType result = ValueInit::init(f, &team_scratch[0]);
+
+        for (int league_id = team_num; league_id < league_size;
+             league_id += num_teams) {
           typename PolicyType::member_type team(
               league_id, league_size, team_size, vector_length, scratch_ptr,
-              blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_same<TagType, void>::value)
+              team_num, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_same<TagType, void>::value) {
             f(team, result);
-          else
+          } else {
             f(TagType(), team, result);
+          }
         }
-      } else
-        Kokkos::abort("`num_teams` clause was not respected.\n");
-    }
-
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
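+    // Combine the per-team partial results with the same pairwise tree
+    // reduction used in the RangePolicy init/join path.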
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < nteams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
+        const int team_offset   = team_size * value_count;
+        ValueJoin::join(
+            f, &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
+          // Do the final only once at the end.
+          if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
+              omp_get_thread_num() == 0)
+            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < nteams);
+
+    // Copy the final result out of scratch: device-to-host if the result
+    // view is on the host, device-to-device otherwise.
+    if (!ptr_on_device)
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_initial_device(), omp_get_default_device()));
+    else
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_default_device(), omp_get_default_device()));
   }
 };
 
@@ -813,11 +1139,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
   using WorkTag = typename Policy::work_tag;
   using Member  = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
   using WorkTagFwd =
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
@@ -831,13 +1155,16 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using reference_type = typename ValueTraits::reference_type;
   using value_type     = typename ValueTraits::value_type;
 
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+
+  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
+  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
 
-  using ParForSpecialize =
+  using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type, HasJoin,
-                               UseReducer>;
+                               typename ValueTraits::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
@@ -846,18 +1173,50 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const int m_shmem_size;
 
  public:
-  inline void execute() const {
-    ParForSpecialize::execute(m_functor, m_policy, m_result_ptr);
+  void execute() const {
+    if constexpr (HasJoin) {
+      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
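+      // Round the runtime array length up to the nearest instantiated
+      // execute_array size (2, 4, 8, 16, 32).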
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<2>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<4>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<8>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<16>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<32>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      ParReduceSpecialize::template execute_array<1>(
+          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
   }
 
   template <class ViewType>
-  inline ParallelReduce(
+  ParallelReduce(
       const FunctorType& arg_functor, const Policy& arg_policy,
       const ViewType& arg_result,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = nullptr)
-      : m_functor(arg_functor),
+      : m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result.size()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
@@ -865,9 +1224,14 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                      FunctorTeamShmemSize<FunctorType>::value(
                          arg_functor, arg_policy.team_size())) {}
 
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_functor(arg_functor),
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_num_elems(reducer.view().size()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
@@ -889,11 +1253,11 @@ struct TeamThreadRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const iType end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline TeamThreadRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, iType count)
+  TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  iType count)
       : start(0), end(count), team(thread_) {}
-  inline TeamThreadRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, iType begin_, iType end_)
+  TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  iType begin_, iType end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -904,12 +1268,11 @@ struct ThreadVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const index_type end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline ThreadVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type count)
+  ThreadVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                    index_type count)
       : start(0), end(count), team(thread_) {}
-  inline ThreadVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
-      index_type end_)
+  ThreadVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                    index_type begin_, index_type end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -920,12 +1283,11 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const index_type end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline TeamVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type count)
+  TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  index_type count)
       : start(0), end(count), team(thread_) {}
-  inline TeamVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
-      index_type end_)
+  TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  index_type begin_, index_type end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -935,5 +1297,4 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#undef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
 #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
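Note on the new `m_result_ptr_on_device` members introduced above: they are derived from `MemorySpaceAccess<...>::accessible`, which answers whether code executing in the OpenMPTarget memory space can dereference the result view's allocation directly, or whether the reduction result must be copied back to the host. A minimal sketch of that query, assuming a Kokkos build with the OpenMPTarget backend (`Kokkos::Impl::MemorySpaceAccess` is an internal trait, used here only for illustration):

```cpp
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Can the device dereference its own allocations? (trivially yes)
    constexpr bool device_result = Kokkos::Impl::MemorySpaceAccess<
        Kokkos::Experimental::OpenMPTargetSpace,
        Kokkos::Experimental::OpenMPTargetSpace>::accessible;
    // Can the device write directly into a HostSpace result view?
    // If not, ParallelReduce must stage the result and copy it back.
    constexpr bool host_result = Kokkos::Impl::MemorySpaceAccess<
        Kokkos::Experimental::OpenMPTargetSpace,
        Kokkos::HostSpace>::accessible;
    static_assert(device_result, "device memory is device-accessible");
    std::printf("host result views accessible from device: %d\n",
                host_result);
  }
  Kokkos::finalize();
}
```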
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
index 3dfad2bb856e0bb65a48dfd70b3458cee4c9beb5..40d8c45f5d0f8ce6798079d9eb3deb48a4361122 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
@@ -91,7 +91,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
 #pragma omp target teams distribute map(to : functor) num_teams(end - begin)
     {
-      for (ptrdiff_t tile_idx = begin; tile_idx < end; tile_idx++) {
+      for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) {
 
 #pragma omp parallel
         {
@@ -116,31 +116,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #endif
   }
 
-  template <int Rank>
-  inline typename std::enable_if<Rank == 1>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-
-    const auto end_0 = policy.m_upper[0];
-
-#pragma omp target teams distribute parallel for map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      functor(i0);
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-#pragma omp for
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      functor(i0);
-    }
-#endif
-  }
-
   template <int Rank>
   inline typename std::enable_if<Rank == 2>::type execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
@@ -154,8 +129,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_1 = policy.m_upper[1];
 
 #pragma omp target teams distribute parallel for collapse(2) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
         if constexpr (std::is_same<typename Policy::work_tag, void>::value)
           functor(i0, i1);
         else
@@ -172,8 +147,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
 
 #pragma omp for collapse(2)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) {
         if constexpr (std::is_same<typename Policy::work_tag, void>::value)
           functor(i0, i1);
         else
@@ -197,9 +172,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_2 = policy.m_upper[2];
 
 #pragma omp target teams distribute parallel for collapse(3) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
           if constexpr (std::is_same<typename Policy::work_tag, void>::value)
             functor(i0, i1, i2);
           else
@@ -221,9 +196,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
 
 #pragma omp for collapse(3)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) {
           if constexpr (std::is_same<typename Policy::work_tag, void>::value)
             functor(i0, i1, i2);
           else
@@ -249,10 +224,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_3 = policy.m_upper[3];
 
 #pragma omp target teams distribute parallel for collapse(4) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
             if constexpr (std::is_same<typename Policy::work_tag, void>::value)
               functor(i0, i1, i2, i3);
             else
@@ -279,10 +254,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
 
 #pragma omp for collapse(4)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) {
             if constexpr (std::is_same<typename Policy::work_tag, void>::value)
               functor(i0, i1, i2, i3);
             else
@@ -310,11 +285,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_4 = policy.m_upper[4];
 
 #pragma omp target teams distribute parallel for collapse(5) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
-            for (auto i4 = begin_4; i4 < end_4; i4++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
+            for (auto i4 = begin_4; i4 < end_4; ++i4) {
               if constexpr (std::is_same<typename Policy::work_tag,
                                          void>::value)
                 functor(i0, i1, i2, i3, i4);
@@ -347,11 +322,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
 
 #pragma omp for collapse(5)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3)
+            for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) {
               if constexpr (std::is_same<typename Policy::work_tag,
                                          void>::value)
                 functor(i0, i1, i2, i3, i4);
@@ -382,12 +357,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_5 = policy.m_upper[5];
 
 #pragma omp target teams distribute parallel for collapse(6) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
-            for (auto i4 = begin_4; i4 < end_4; i4++) {
-              for (auto i5 = begin_5; i5 < end_5; i5++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
+            for (auto i4 = begin_4; i4 < end_4; ++i4) {
+              for (auto i5 = begin_5; i5 < end_5; ++i5) {
                 {
                   if constexpr (std::is_same<typename Policy::work_tag,
                                              void>::value)
@@ -428,12 +403,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
 
 #pragma omp for collapse(6)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3)
+            for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4)
+              for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) {
                 if constexpr (std::is_same<typename Policy::work_tag,
                                            void>::value)
                   functor(i0, i1, i2, i3, i4, i5);
@@ -443,195 +418,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #endif
   }
 
-  template <int Rank>
-  inline typename std::enable_if<Rank == 7>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const int begin_0 = policy.m_lower[0];
-    const int begin_1 = policy.m_lower[1];
-    const int begin_2 = policy.m_lower[2];
-    const int begin_3 = policy.m_lower[3];
-    const int begin_4 = policy.m_lower[4];
-    const int begin_5 = policy.m_lower[5];
-    const int begin_6 = policy.m_lower[6];
-
-    const int end_0 = policy.m_upper[0];
-    const int end_1 = policy.m_upper[1];
-    const int end_2 = policy.m_upper[2];
-    const int end_3 = policy.m_upper[3];
-    const int end_4 = policy.m_upper[4];
-    const int end_5 = policy.m_upper[5];
-    const int end_6 = policy.m_upper[6];
-
-#pragma omp target teams distribute parallel for collapse(7) map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  if constexpr (std::is_same<typename Policy::work_tag,
-                                             void>::value)
-                    functor(i0, i1, i2, i3, i4, i5, i6);
-                  else
-                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
-                            i6);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-
-    const ptrdiff_t begin_1 = offset[1];
-    ptrdiff_t end_1         = begin_1 + policy.m_tile[1];
-    end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
-
-    const ptrdiff_t begin_2 = offset[2];
-    ptrdiff_t end_2         = begin_2 + policy.m_tile[2];
-    end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
-
-    const ptrdiff_t begin_3 = offset[3];
-    ptrdiff_t end_3         = begin_3 + policy.m_tile[3];
-    end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
-
-    const ptrdiff_t begin_4 = offset[4];
-    ptrdiff_t end_4         = begin_4 + policy.m_tile[4];
-    end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
-
-    const ptrdiff_t begin_5 = offset[5];
-    ptrdiff_t end_5         = begin_5 + policy.m_tile[5];
-    end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
-
-    const ptrdiff_t begin_6 = offset[6];
-    ptrdiff_t end_6         = begin_6 + policy.m_tile[6];
-    end_6 = end_6 < policy.m_upper[6] ? end_6 : policy.m_upper[6];
-
-#pragma omp for collapse(7)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  if constexpr (std::is_same<typename Policy::work_tag,
-                                             void>::value)
-                    functor(i0, i1, i2, i3, i4, i5, i6);
-                  else
-                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
-                            i6);
-                }
-#endif
-  }
-
-  template <int Rank>
-  inline typename std::enable_if<Rank == 8>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const int begin_0 = policy.m_lower[0];
-    const int begin_1 = policy.m_lower[1];
-    const int begin_2 = policy.m_lower[2];
-    const int begin_3 = policy.m_lower[3];
-    const int begin_4 = policy.m_lower[4];
-    const int begin_5 = policy.m_lower[5];
-    const int begin_6 = policy.m_lower[6];
-    const int begin_7 = policy.m_lower[7];
-
-    const int end_0 = policy.m_upper[0];
-    const int end_1 = policy.m_upper[1];
-    const int end_2 = policy.m_upper[2];
-    const int end_3 = policy.m_upper[3];
-    const int end_4 = policy.m_upper[4];
-    const int end_5 = policy.m_upper[5];
-    const int end_6 = policy.m_upper[6];
-    const int end_7 = policy.m_upper[7];
-
-#pragma omp target teams distribute parallel for collapse(8) map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
-                    if constexpr (std::is_same<typename Policy::work_tag,
-                                               void>::value)
-                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
-                    else
-                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
-                              i5, i6, i7);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-
-    const ptrdiff_t begin_1 = offset[1];
-    ptrdiff_t end_1         = begin_1 + policy.m_tile[1];
-    end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
-
-    const ptrdiff_t begin_2 = offset[2];
-    ptrdiff_t end_2         = begin_2 + policy.m_tile[2];
-    end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
-
-    const ptrdiff_t begin_3 = offset[3];
-    ptrdiff_t end_3         = begin_3 + policy.m_tile[3];
-    end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
-
-    const ptrdiff_t begin_4 = offset[4];
-    ptrdiff_t end_4         = begin_4 + policy.m_tile[4];
-    end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
-
-    const ptrdiff_t begin_5 = offset[5];
-    ptrdiff_t end_5         = begin_5 + policy.m_tile[5];
-    end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
-
-    const ptrdiff_t begin_6 = offset[6];
-    ptrdiff_t end_6         = begin_6 + policy.m_tile[6];
-    end_6 = end_6 < policy.m_upper[6] ? end_6 : policy.m_upper[6];
-
-    const ptrdiff_t begin_7 = offset[7];
-    ptrdiff_t end_7         = begin_7 + policy.m_tile[7];
-    end_7 = end_7 < policy.m_upper[7] ? end_7 : policy.m_upper[7];
-
-#pragma omp for collapse(8)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++)
-                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
-                    if constexpr (std::is_same<typename Policy::work_tag,
-                                               void>::value)
-                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
-                    else
-                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
-                              i5, i6, i7);
-                  }
-#endif
-  }
-
   inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
   // TODO DZP: based on a conversation with Christian, we're using 256 as a
@@ -652,112 +438,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 namespace Kokkos {
 namespace Impl {
 
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType,
-                                Kokkos::MDRangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, 0, 0> {
-  using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
-
-    ValueType result = ValueType();
-#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
-    for (int i = begin; i < end; i++) f(i, result);
-
-    *result_ptr = result;
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
-
-    ValueType result = ValueType();
-#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
-    for (int i = begin; i < end; i++) f(TagType(), i, result);
-
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
-  }
-};
-/*
-template<class FunctorType, class PolicyType, class ReducerType, class
-PointerType, class ValueType> struct ParallelReduceSpecialize<FunctorType,
-PolicyType, ReducerType, PointerType, ValueType, 0,1> {
-
-  #pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out,
-omp_in)) initializer ( ReducerType::init(omp_priv) )
-
-  template< class TagType >
-  inline static
-  typename std::enable_if< std::is_same< TagType , void >::value >::type
-  execute_impl(const FunctorType& f, const PolicyType& p, PointerType
-result_ptr)
-    {
-      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget
-parallel_for");
-      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget
-parallel_for"); const typename PolicyType::member_type begin = p.begin(); const
-typename PolicyType::member_type end = p.end();
-
-      ValueType result = ValueType();
-      #pragma omp target teams distribute parallel for num_teams(512) map(to:f)
-map(tofrom:result) reduction(custom: result) for(int i=begin; i<end; i++)
-        f(i,result);
-
-      *result_ptr=result;
-    }
-
-
-  template< class TagType >
-  inline static
-  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  execute_impl(const FunctorType& f, const PolicyType& p, PointerType
-result_ptr)
-    {
-      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget
-parallel_for");
-      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget
-parallel_for"); const typename PolicyType::member_type begin = p.begin(); const
-typename PolicyType::member_type end = p.end();
-
-      ValueType result = ValueType();
-      #pragma omp target teams distribute parallel for num_teams(512) map(to:f)
-map(tofrom: result) reduction(custom: result) for(int i=begin; i<end; i++)
-        f(TagType(),i,result);
-
-      *result_ptr=result;
-    }
-
-
-    inline static
-    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
-      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
-    }
-};
-
-
 template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                      Kokkos::Experimental::OpenMPTarget> {
@@ -765,42 +445,38 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using Policy = Kokkos::MDRangePolicy<Traits...>;
 
   using WorkTag = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member = typename Policy::member_type;
+  using Member  = typename Policy::member_type;
 
   using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
+      std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                       FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                               WorkTag, void>::type;
-
-  // Static Assert WorkTag void if ReducerType not InvalidType
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using ValueTraits =
       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
 
-  using pointer_type = typename ValueTraits::pointer_type;
+  using pointer_type   = typename ValueTraits::pointer_type;
   using reference_type = typename ValueTraits::reference_type;
 
-  using ParForSpecialize = ParallelReduceSpecialize<
-      FunctorType, Policy, ReducerType, pointer_type,
-      typename ValueTraits::value_type, HasJoin, UseReducer>;
+  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
+  enum { UseReducer = is_reducer_type<ReducerType>::value };
 
+  const pointer_type m_result_ptr;
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
+
+  using ParReduceCommon = ParallelReduceCommon<pointer_type>;
+
+  bool m_result_ptr_on_device;
 
  public:
   inline void execute() const {
-    ParForSpecialize::execute(m_functor, m_policy, m_result_ptr);
+    execute_tile<Policy::rank, typename ValueTraits::value_type>(
+        m_functor, m_policy, m_result_ptr);
   }
 
   template <class ViewType>
@@ -810,35 +486,345 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = NULL)
-      : m_functor(arg_functor),
+      : m_result_ptr(arg_result_view.data()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    //static_assert( std::is_same< typename ViewType::memory_space
-    //                                , Kokkos::HostSpace >::value
-    //  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a
-    //  Kokkos::View in HostSpace" );
-  }
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible) {}
 
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
-      : m_functor(arg_functor),
+      : m_result_ptr(reducer.view().data()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    //static_assert( std::is_same< typename ViewType::memory_space
-    //                                , Kokkos::HostSpace >::value
-    //  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a
-    //  Kokkos::View in HostSpace" );
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 2>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(2) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, result);
+          else
+            functor(typename Policy::work_tag(), i0, i1, result);
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, result);
+          else
+            functor(typename Policy::work_tag(), i0, i1, result);
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
   }
-  // TODO DZP: based on a conversation with Christian, we're using 256 as a
-heuristic
-  // here. We need something better once we can query these kinds of properties
-  template<typename Policy, typename Functor>
-static int max_tile_size_product(const Policy&, const Functor&) {
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 3>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(3) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, result);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, result);
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, result);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, result);
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 4>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(4) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, result);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, result);
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, result);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, result);
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 5>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+    const auto begin_4 = policy.m_lower[4];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+    const auto end_4 = policy.m_upper[4];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(5) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                if constexpr (std::is_same<typename Policy::work_tag,
+                                           void>::value)
+                  functor(i0, i1, i2, i3, i4, result);
+                else
+                  functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                          result);
+              }
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                if constexpr (std::is_same<typename Policy::work_tag,
+                                           void>::value)
+                  functor(i0, i1, i2, i3, i4, result);
+                else
+                  functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                          result);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 6>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+    const auto begin_4 = policy.m_lower[4];
+    const auto begin_5 = policy.m_lower[5];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+    const auto end_4 = policy.m_upper[4];
+    const auto end_5 = policy.m_upper[5];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(6) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                for (auto i5 = begin_5; i5 < end_5; ++i5) {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, result);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            result);
+                }
+              }
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                for (auto i5 = begin_5; i5 < end_5; ++i5) {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, result);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            result);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
     return 256;
   }
-};*/
+};
 
 }  // namespace Impl
 }  // namespace Kokkos
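The rank-specialized `execute_tile` overloads above lean on OpenMP user-defined reductions: a combiner and initializer are registered once via `declare reduction` and then named in the `reduction(custom : result)` clause. A standalone host-OpenMP sketch of the same pattern, with a hypothetical `MinLoc` reducer standing in for `OpenMPTargetReducerWrapper<ReducerType>` (illustrative only, not the Kokkos wrapper; compile with `-fopenmp`):

```cpp
#include <cstdio>

struct MinLoc {
  double val;
  int loc;
};

// Combiner and identity hooks, analogous to ReducerWrapper::join/init.
static void join(MinLoc& out, const MinLoc& in) {
  if (in.val < out.val) out = in;
}
static void init(MinLoc& v) {
  v.val = 1.0e300;
  v.loc = -1;
}

// Register the reduction once; each thread's private copy is seeded by
// init(omp_priv) and merged with join(omp_out, omp_in).
#pragma omp declare reduction(custom_min : MinLoc : join(omp_out, omp_in)) \
    initializer(init(omp_priv))

int main() {
  const int n = 1000;
  MinLoc result;
  init(result);
#pragma omp parallel for reduction(custom_min : result)
  for (int i = 0; i < n; ++i) {
    MinLoc candidate{(i - 400) * (i - 400) * 1.0, i};
    join(result, candidate);
  }
  std::printf("min %.1f at %d\n", result.val, result.loc);  // min 0.0 at 400
  return 0;
}
```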
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
index be924ffa61c1f8cf696b3b84cb44765536fde4f9..0e71a239caf343d77f6ed05ff02bb2e45ca64efd 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
@@ -112,35 +112,11 @@ void TaskExec<Kokkos::Experimental::OpenMPTarget>::team_barrier_impl() const {
   // This team member sets one byte within the sync variable
   int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank;
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
-
   *sync_self = int8_t(m_sync_value & 0x03);  // signal arrival
 
   while (m_sync_value != *sync)
     ;  // wait for team to arrive
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
-
   ++m_sync_step;
 
   if (0 == (0x01 & m_sync_step)) {  // Every other step
@@ -222,17 +198,6 @@ void TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget>::execute(
         task = *task_shared;
       }
 
-#if 0
-fprintf( stdout
-       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
-       , team_exec.m_group_rank
-       , team_exec.m_team_rank
-       , uintptr_t(task_shared)
-       , uintptr_t(task)
-       );
-fflush(stdout);
-#endif
-
       if (0 == task) break;  // 0 == m_ready_count
 
       if (end == task) {
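With the `#if 0` debug dumps removed, `team_barrier_impl` is reduced to its core handshake: each team member writes one byte of a shared word and spins until the packed word matches the expected arrival pattern. A simplified sketch of that scheme, using `std::atomic` bytes and plain threads in place of the volatile accesses in the Kokkos code:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  constexpr int team_size = 4;
  alignas(8) std::atomic<int8_t> sync[8];
  for (auto& b : sync) b.store(0);

  // Expected packed word once every member has written its arrival byte.
  const int64_t expected = 0x0101010101010101LL >> (8 * (8 - team_size));

  auto member = [&](int rank) {
    sync[rank].store(1, std::memory_order_release);  // signal arrival
    // Wait for the team: reread the packed word until it matches.
    for (;;) {
      int64_t word = 0;
      for (int r = 0; r < team_size; ++r)
        word |= int64_t(sync[r].load(std::memory_order_acquire)) << (8 * r);
      if (word == expected) break;
    }
    std::printf("rank %d passed barrier\n", rank);
  };

  std::vector<std::thread> team;
  for (int r = 0; r < team_size; ++r) team.emplace_back(member, r);
  for (auto& t : team) t.join();
  return 0;
}
```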
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 3a09ee919540b93c74bcd5f2e7eea57b352575a7..18d33317a29819274037085b6a69e9239797f1a8 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -112,14 +112,36 @@ void SYCL::print_configuration(std::ostream& s, const bool detailed) {
 }
 
 void SYCL::fence() const {
-  Impl::SYCLInternal::fence(*m_space_instance->m_queue);
+  fence("Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence");
+}
+void SYCL::fence(const std::string& name) const {
+  Impl::SYCLInternal::fence(*m_space_instance->m_queue, name,
+                            impl_instance_id());
 }
 
 void SYCL::impl_static_fence() {
-  // guard accessing all_queues
-  std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex);
-  for (auto& queue : Impl::SYCLInternal::all_queues)
-    Impl::SYCLInternal::fence(**queue);
+  impl_static_fence(
+      "Kokkos::Experimental::SYCL::impl_static_fence: Unnamed Global Fence");
+}
+void SYCL::impl_static_fence(const std::string& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::SYCL>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() {
+        // guard accessing all_queues
+        std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex);
+        for (auto& queue : Impl::SYCLInternal::all_queues) {
+          try {
+            (*queue)->wait_and_throw();
+          } catch (sycl::exception const& e) {
+            Kokkos::Impl::throw_runtime_exception(
+                std::string("There was a synchronous SYCL error:\n") +=
+                e.what());
+          }
+        }
+      });
 }
 
 int SYCL::sycl_device() const {
@@ -224,10 +246,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
             << device.get_info<device::global_mem_cache_size>()
             << "\nGlobal Mem Size: "
             << device.get_info<device::global_mem_size>()
-            << "\nMax Constant Buffer Size: "
-            << device.get_info<device::max_constant_buffer_size>()
-            << "\nMax Constant Args: "
-            << device.get_info<device::max_constant_args>()
             << "\nLocal Mem Size: " << device.get_info<device::local_mem_size>()
             << "\nError Correction Support: "
             << device.get_info<device::error_correction_support>()
@@ -296,6 +314,9 @@ void SYCLSpaceInitializer::finalize(const bool all_spaces) {
 void SYCLSpaceInitializer::fence() {
   Kokkos::Experimental::SYCL::impl_static_fence();
 }
+void SYCLSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::SYCL::impl_static_fence(name);
+}
 
 void SYCLSpaceInitializer::print_configuration(std::ostream& msg,
                                                const bool detail) {
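Usage-wise, the new overloads let callers attach a label that is forwarded to Kokkos Tools, so profilers can attribute the wait; the unnamed variants now simply delegate with a default label. A hedged usage sketch, assuming a SYCL-enabled build (the label strings are made up):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Instance fence: waits on this execution space instance's queue.
    Kokkos::Experimental::SYCL sycl;
    sycl.fence("my_app: wait for deep_copy batch");
    // Global fence: waits on every SYCL queue Kokkos knows about.
    Kokkos::Experimental::SYCL::impl_static_fence("my_app: global sync");
  }
  Kokkos::finalize();
  return 0;
}
```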
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
index aef65ee7ecbbf3c39432b42a42b595dbfe00b239..3eeab5636342031955920c81e807b218d662f3b8 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
@@ -48,181 +48,144 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_SYCL.hpp>
 
+#include <vector>
+
 #ifdef KOKKOS_ENABLE_SYCL
 
 namespace Kokkos {
 namespace Impl {
 
-template <>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Experimental::SYCL, DT, DP...> {
+  ZeroMemset(const Kokkos::Experimental::SYCL& exec_space,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    auto event = exec_space.impl_internal_space_instance()->m_queue->memset(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type));
+    exec_space.impl_internal_space_instance()->m_queue->submit_barrier(
+        std::vector<sycl::event>{event});
   }
 
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    Experimental::Impl::SYCLInternal::singleton().m_queue->memset(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type));
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
-  }
+void DeepCopySYCL(void* dst, const void* src, size_t n);
+void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
+                       const void* src, size_t n);
+void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n);
 
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
-  }
-
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace1>::value &&
+                                 is_sycl_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
+  }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace,
-                Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace1, MemSpace2, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace1>::value &&
+        is_sycl_type_space<MemSpace2>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
 
-template <>
-struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace,
-                Kokkos::Experimental::SYCL>
-    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace, HostSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
-};
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLDeviceUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
-};
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    HostSpace, MemSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                      ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                 ExecutionSpace>::DeepCopy;
-};
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                      ExecutionSpace> {
-  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                 ExecutionSpace>::DeepCopy;
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
index 5a702b5027277cc7137cba9bba72e7367e9ae97b..816b42038ed0bb1605d005375bd39d2de4e3d69d 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
@@ -42,14 +42,7 @@
 //@HEADER
 */
 
-#include <Kokkos_Concepts.hpp>
-#include <SYCL/Kokkos_SYCL_Instance.hpp>
-#include <KokkosCore_Config_DeclareBackend.hpp>
-#include <Kokkos_SYCL.hpp>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_Serial.hpp>
-#include <impl/Kokkos_ConcurrentBitset.hpp>
-#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Core.hpp>  // for kokkos_malloc, kokkos_realloc, kokkos_free
 
 namespace Kokkos {
 namespace Experimental {
@@ -122,7 +115,6 @@ void SYCLInternal::initialize(const sycl::queue& q) {
       all_queues.push_back(&m_queue);
     }
     const sycl::device& d = m_queue->get_device();
-    std::cout << SYCL::SYCLDevice(d) << '\n';
 
     m_maxWorkgroupSize =
         d.template get_info<sycl::info::device::max_work_group_size>();
@@ -140,19 +132,22 @@ void SYCLInternal::initialize(const sycl::queue& q) {
           Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
       Record* const r =
           Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                           "Kokkos::SYCL::InternalScratchBitset",
+                           "Kokkos::Experimental::SYCL::InternalScratchBitset",
                            sizeof(uint32_t) * buffer_bound);
       Record::increment(r);
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t*>(r->data());
       auto event                = m_queue->memset(m_scratchConcurrentBitset, 0,
                                    sizeof(uint32_t) * buffer_bound);
-      fence(event);
+      fence(event,
+            "Kokkos::Experimental::SYCLInternal::initialize: fence after "
+            "initializing m_scratchConcurrentBitset",
+            m_instance_id);
     }
 
     m_maxShmemPerBlock =
         d.template get_info<sycl::info::device::local_mem_size>();
-    m_indirectKernelMem.reset(*m_queue);
-    m_indirectReducerMem.reset(*m_queue);
+    m_indirectKernelMem.reset(*m_queue, m_instance_id);
+    m_indirectReducerMem.reset(*m_queue, m_instance_id);
   } else {
     std::ostringstream msg;
     msg << "Kokkos::Experimental::SYCL::initialize(...) FAILED";
@@ -162,10 +157,36 @@ void SYCLInternal::initialize(const sycl::queue& q) {
     }
     Kokkos::Impl::throw_runtime_exception(msg.str());
   }
+
+  m_team_scratch_current_size = 0;
+  m_team_scratch_ptr          = nullptr;
+}
+
+void* SYCLInternal::resize_team_scratch_space(std::int64_t bytes,
+                                              bool force_shrink) {
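+  // Allocate lazily on first use; afterwards reallocate only when the
+  // request grows or when shrinking is explicitly forced.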
+  if (m_team_scratch_current_size == 0) {
+    m_team_scratch_current_size = bytes;
+    m_team_scratch_ptr =
+        Kokkos::kokkos_malloc<Experimental::SYCLDeviceUSMSpace>(
+            "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory",
+            m_team_scratch_current_size);
+  }
+  if ((bytes > m_team_scratch_current_size) ||
+      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
+    m_team_scratch_current_size = bytes;
+    m_team_scratch_ptr =
+        Kokkos::kokkos_realloc<Experimental::SYCLDeviceUSMSpace>(
+            m_team_scratch_ptr, m_team_scratch_current_size);
+  }
+  return m_team_scratch_ptr;
 }
 
+uint32_t SYCLInternal::impl_get_instance_id() const { return m_instance_id; }
+
 void SYCLInternal::finalize() {
-  SYCL().fence();
+  SYCLInternal::fence(*m_queue,
+                      "Kokkos::Experimental::SYCLInternal::finalize: fence on "
+                      "finalization",
+                      m_instance_id);
   was_finalized = true;
 
   using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
@@ -182,6 +203,12 @@ void SYCLInternal::finalize() {
   RecordSYCL::decrement(RecordSYCL::get_record(m_scratchConcurrentBitset));
   m_scratchConcurrentBitset = nullptr;
 
+  if (m_team_scratch_current_size > 0)
+    Kokkos::kokkos_free<Kokkos::Experimental::SYCLDeviceUSMSpace>(
+        m_team_scratch_ptr);
+  m_team_scratch_current_size = 0;
+  m_team_scratch_ptr          = nullptr;
+
   m_indirectKernelMem.reset();
   m_indirectReducerMem.reset();
   // guard erasing from all_queues
@@ -208,7 +235,7 @@ void* SYCLInternal::scratch_space(
 
     Record* const r =
         Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::SYCL::InternalScratchSpace",
+                         "Kokkos::Experimental::SYCL::InternalScratchSpace",
                          (sizeScratchGrain * m_scratchSpaceCount));
 
     Record::increment(r);
@@ -235,7 +262,7 @@ void* SYCLInternal::scratch_flags(
 
     Record* const r =
         Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::SYCL::InternalScratchFlags",
+                         "Kokkos::Experimental::SYCL::InternalScratchFlags",
                          (sizeScratchGrain * m_scratchFlagsCount));
 
     Record::increment(r);
@@ -243,14 +270,38 @@ void* SYCLInternal::scratch_flags(
     m_scratchFlags = reinterpret_cast<size_type*>(r->data());
   }
   m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain);
-  fence(*m_queue);
+  fence(*m_queue,
+        "Kokkos::Experimental::SYCLInternal::scratch_flags fence after "
+        "initializing m_scratchFlags",
+        m_instance_id);
 
   return m_scratchFlags;
 }
 
+template <typename WAT>
+void SYCLInternal::fence_helper(WAT& wat, const std::string& name,
+                                uint32_t instance_id) {
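+  // Wrap wait_and_throw() in a profiling fence event so tools can attribute
+  // the wait to this instance; the explicit instantiations below cover the
+  // two users, sycl::queue and sycl::event.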
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::SYCL>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id},
+      [&]() {
+        try {
+          wat.wait_and_throw();
+        } catch (sycl::exception const& e) {
+          Kokkos::Impl::throw_runtime_exception(
+              std::string("There was a synchronous SYCL error:\n") += e.what());
+        }
+      });
+}
+template void SYCLInternal::fence_helper<sycl::queue>(sycl::queue&,
+                                                      const std::string&,
+                                                      uint32_t);
+template void SYCLInternal::fence_helper<sycl::event>(sycl::event&,
+                                                      const std::string&,
+                                                      uint32_t);
+
 template <sycl::usm::alloc Kind>
 size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
-  assert(m_size == 0);
   assert(m_q);
 
   if (m_capacity < n) {
@@ -258,8 +309,8 @@ size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
     // First free what we have (in case malloc can reuse it)
     if (m_data) Record::decrement(Record::get_record(m_data));
 
-    Record* const r = Record::allocate(AllocationSpace(*m_q),
-                                       "Kokkos::SYCL::USMObjectMem", n);
+    Record* const r = Record::allocate(
+        AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n);
     Record::increment(r);
 
     m_data     = r->data();
@@ -271,9 +322,9 @@ size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
 
 template <sycl::usm::alloc Kind>
 void SYCLInternal::USMObjectMem<Kind>::reset() {
-  assert(m_size == 0);
-
   if (m_data) {
+    // Deallocating implies a fence across all registered queues, and this
+    // class is not copyable, so no explicit fence is needed here.
     using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>;
     Record::decrement(Record::get_record(m_data));
 
@@ -285,6 +336,7 @@ void SYCLInternal::USMObjectMem<Kind>::reset() {
 
 template class SYCLInternal::USMObjectMem<sycl::usm::alloc::shared>;
 template class SYCLInternal::USMObjectMem<sycl::usm::alloc::device>;
+template class SYCLInternal::USMObjectMem<sycl::usm::alloc::host>;
 
 }  // namespace Impl
 }  // namespace Experimental
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
index e797411cd40bdd734c04d2a9b0e51151fa269ebd..bf4d6c5b459579213866f6dcb99332c6e641c3a1 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
@@ -49,7 +49,7 @@
 #include <CL/sycl.hpp>
 
 #include <impl/Kokkos_Error.hpp>
-
+#include <impl/Kokkos_Profiling.hpp>
+
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
@@ -68,7 +68,10 @@ class SYCLInternal {
 
   void* scratch_space(const size_type size);
   void* scratch_flags(const size_type size);
+  void* resize_team_scratch_space(std::int64_t bytes,
+                                  bool force_shrink = false);
 
+  uint32_t impl_get_instance_id() const;
   int m_syclDev = -1;
 
   size_t m_maxWorkgroupSize   = 0;
@@ -81,6 +84,11 @@ class SYCLInternal {
   size_type m_scratchFlagsCount       = 0;
   size_type* m_scratchFlags           = nullptr;
 
+  int64_t m_team_scratch_current_size = 0;
+  void* m_team_scratch_ptr            = nullptr;
+
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::SYCL>(reinterpret_cast<uintptr_t>(this));
   std::optional<sycl::queue> m_queue;
 
   // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when
@@ -94,40 +102,16 @@ class SYCLInternal {
   template <sycl::usm::alloc Kind>
   class USMObjectMem {
    public:
-    class Deleter {
-     public:
-      Deleter() = default;
-      explicit Deleter(USMObjectMem* mem) : m_mem(mem) {}
-
-      template <typename T>
-      void operator()(T* p) const noexcept {
-        assert(m_mem);
-        assert(sizeof(T) == m_mem->size());
-
-        if constexpr (sycl::usm::alloc::device == kind)
-          // Only skipping the dtor on trivially copyable types
-          static_assert(std::is_trivially_copyable_v<T>);
-        else
-          p->~T();
-
-        m_mem->m_size = 0;
-      }
-
-     private:
-      USMObjectMem* m_mem = nullptr;
-    };
-
-    static constexpr sycl::usm::alloc kind = Kind;
-
     void reset();
 
-    void reset(sycl::queue q) {
+    void reset(sycl::queue q, uint32_t instance_id) {
+      m_instance_id = instance_id;
       reset();
       m_q.emplace(std::move(q));
     }
-
     USMObjectMem() = default;
-    explicit USMObjectMem(sycl::queue q) noexcept : m_q(std::move(q)) {}
+    explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept
+        : m_q(std::move(q)), m_instance_id(instance_id) {}
 
     USMObjectMem(USMObjectMem const&) = delete;
     USMObjectMem(USMObjectMem&&)      = delete;
@@ -139,7 +123,6 @@ class SYCLInternal {
     void* data() noexcept { return m_data; }
     const void* data() const noexcept { return m_data; }
 
-    size_t size() const noexcept { return m_size; }
     size_t capacity() const noexcept { return m_capacity; }
 
     // reserve() allocates space for at least n bytes
@@ -147,120 +130,68 @@ class SYCLInternal {
     size_t reserve(size_t n);
 
    private:
-    using AllocationSpace =
-        std::conditional_t<Kind == sycl::usm::alloc::device,
-                           Kokkos::Experimental::SYCLDeviceUSMSpace,
-                           Kokkos::Experimental::SYCLSharedUSMSpace>;
-
-    // This will memcpy an object T into memory held by this object
-    // returns: a T* to that object
-    //
-    // Note:  it is UB to dereference this pointer with an object that is
-    // not an implicit-lifetime nor trivially-copyable type, but presumably much
-    // faster because we can use USM device memory
-    template <typename T>
-    std::unique_ptr<T, Deleter> memcpy_from(const T& t) {
-      reserve(sizeof(T));
-      sycl::event memcopied = m_q->memcpy(m_data, std::addressof(t), sizeof(T));
-      fence(memcopied);
-
-      std::unique_ptr<T, Deleter> ptr(reinterpret_cast<T*>(m_data),
-                                      Deleter(this));
-      m_size = sizeof(T);
-      return ptr;
-    }
-
-    // This will copy-constuct an object T into memory held by this object
-    // returns: a unique_ptr<T, destruct_delete> that will call the
-    // destructor on the type when it goes out of scope.
-    //
-    // Note:  This will not work with USM device memory
-    template <typename T>
-    std::unique_ptr<T, Deleter> copy_construct_from(const T& t) {
-      static_assert(kind != sycl::usm::alloc::device,
-                    "Cannot copy construct into USM device memory");
-
-      reserve(sizeof(T));
-
-      std::unique_ptr<T, Deleter> ptr(new (m_data) T(t), Deleter(this));
-      m_size = sizeof(T);
-      return ptr;
-    }
+    using AllocationSpace = std::conditional_t<
+        Kind == sycl::usm::alloc::device,
+        Kokkos::Experimental::SYCLDeviceUSMSpace,
+        std::conditional_t<Kind == sycl::usm::alloc::shared,
+                           Kokkos::Experimental::SYCLSharedUSMSpace,
+                           Kokkos::Experimental::SYCLHostUSMSpace>>;
 
    public:
-    // Performs either memcpy (for USM device memory) and returns a T*
-    // (but is technically UB when dereferenced on an object that is not
-    // an implicit-lifetime nor trivially-copyable type
-    //
-    // or
-    //
-    // performs copy construction (for other USM memory types) and returns a
-    // unique_ptr<T, ...>
-    template <typename T>
-    std::unique_ptr<T, Deleter> copy_from(const T& t) {
-      if constexpr (sycl::usm::alloc::device == kind)
-        return memcpy_from(t);
-      else
-        return copy_construct_from(t);
-    }
-
-   private:
-    // Returns a reference to t (helpful when debugging)
+    // Performs either a sycl::queue::memcpy (for USM device memory) or a
+    // std::memcpy (otherwise) and returns a reference to the copied object.
     template <typename T>
-    T& memcpy_to(T& t) {
-      assert(sizeof(T) == m_size);
-
-      sycl::event memcopied = m_q->memcpy(std::addressof(t), m_data, sizeof(T));
-      fence(memcopied);
-
-      return t;
+    T& copy_from(const T& t) {
+      fence();
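+      // Wait for the kernel registered via register_event() to finish
+      // before overwriting the buffer.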
+      reserve(sizeof(T));
+      if constexpr (sycl::usm::alloc::device == Kind) {
+        sycl::event memcopied =
+            m_q->memcpy(m_data, std::addressof(t), sizeof(T));
+        SYCLInternal::fence(
+            memcopied,
+            "Kokkos::Experimental::SYCLInternal::USMObject fence after copy",
+            m_instance_id);
+      } else
+        std::memcpy(m_data, std::addressof(t), sizeof(T));
+      return *reinterpret_cast<T*>(m_data);
     }
 
-    // Returns a reference to t (helpful when debugging)
-    template <typename T>
-    T& move_assign_to(T& t) {
-      static_assert(kind != sycl::usm::alloc::device,
-                    "Cannot move_assign_to from USM device memory");
-
-      assert(sizeof(T) == m_size);
-
-      t = std::move(*static_cast<T*>(m_data));
-
-      return t;
+    void fence() {
+      SYCLInternal::fence(
+          m_last_event,
+          "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for "
+          "last event to finish",
+          m_instance_id);
     }
 
-   public:
-    // Returns a reference to t (helpful when debugging)
-    template <typename T>
-    T& transfer_to(T& t) {
-      if constexpr (sycl::usm::alloc::device == kind)
-        return memcpy_to(t);
-      else
-        return move_assign_to(t);
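+    // Record the event of the kernel consuming this allocation so the next
+    // copy_from() can wait on it instead of fencing eagerly.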
+    void register_event(sycl::event event) {
+      assert(m_last_event
+                 .get_info<sycl::info::event::command_execution_status>() ==
+             sycl::info::event_command_status::complete);
+      m_last_event = event;
     }
 
    private:
     // USMObjectMem class invariants
-    // All four expressions below must evaluate to true:
+    // Both expressions below must evaluate to true:
     //
-    //  !m_data == !m_capacity
-    //  m_q || !m_data
-    //  m_data || !m_size
-    //  m_size <= m_capacity
+    //  !m_data == (m_capacity == 0)
+    //  m_q || !m_data
     //
     //  The above invariants mean that:
-    //  if m_size != 0 then m_data != 0
-    //  if m_data != 0 then m_capacity != 0 && m_q != nullopt
-    //  if m_data == 0 then m_capacity == 0
+    //  if m_data != nullptr then m_capacity != 0 && m_q != nullopt
+    //  if m_data == nullptr then m_capacity == 0
 
     std::optional<sycl::queue> m_q;
     void* m_data      = nullptr;
-    size_t m_size     = 0;  // sizeof(T) iff m_data points to live T
     size_t m_capacity = 0;
+    sycl::event m_last_event;
+
+    uint32_t m_instance_id;
   };
 
   // An indirect kernel is one where the functor to be executed is explicitly
-  // copied to USM device memory before being executed, to get around the
+  // copied to USM memory before being executed, to get around the
   // trivially copyable limitation of SYCL.
   using IndirectKernelMem = USMObjectMem<sycl::usm::alloc::shared>;
   IndirectKernelMem m_indirectKernelMem;
@@ -286,18 +217,18 @@ class SYCLInternal {
   // fence(...) takes any type with a .wait_and_throw() method
   // (sycl::event and sycl::queue)
   template <typename WAT>
-  static void fence_helper(WAT& wat) {
-    try {
-      wat.wait_and_throw();
-    } catch (sycl::exception const& e) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("There was a synchronous SYCL error:\n") += e.what());
-    }
-  }
+  static void fence_helper(WAT& wat, const std::string& name,
+                           uint32_t instance_id);
 
  public:
-  static void fence(sycl::queue& q) { fence_helper(q); }
-  static void fence(sycl::event& e) { fence_helper(e); }
+  static void fence(sycl::queue& q, const std::string& name,
+                    uint32_t instance_id) {
+    fence_helper(q, name, instance_id);
+  }
+  static void fence(sycl::event& e, const std::string& name,
+                    uint32_t instance_id) {
+    fence_helper(e, name, instance_id);
+  }
 };
 
 template <typename Functor, typename Storage,
@@ -312,20 +243,24 @@ class SYCLFunctionWrapper<Functor, Storage, true> {
   SYCLFunctionWrapper(const Functor& functor, Storage&) : m_functor(functor) {}
 
   const Functor& get_functor() const { return m_functor; }
+
+  static void register_event(Storage&, sycl::event) {}
 };
 
 template <typename Functor, typename Storage>
 class SYCLFunctionWrapper<Functor, Storage, false> {
-  std::unique_ptr<Functor,
-                  Experimental::Impl::SYCLInternal::IndirectKernelMem::Deleter>
-      m_kernelFunctorPtr;
+  const Functor& m_kernelFunctor;
 
  public:
   SYCLFunctionWrapper(const Functor& functor, Storage& storage)
-      : m_kernelFunctorPtr(storage.copy_from(functor)) {}
+      : m_kernelFunctor(storage.copy_from(functor)) {}
 
   std::reference_wrapper<const Functor> get_functor() const {
-    return {*m_kernelFunctorPtr};
+    return {m_kernelFunctor};
+  }
+
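+  // Forward the kernel's completion event to the backing USM storage so the
+  // functor copy is not overwritten while the kernel may still read it.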
+  static void register_event(Storage& storage, sycl::event event) {
+    storage.register_event(event);
   }
 };
 
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
index a286169c45988339dce1b14c6d6a4ffde25dcea5..dca73683c3d1f06157affec3e8fe00feb7d36fd0 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
@@ -47,11 +47,13 @@
 
 #include <impl/KokkosExp_IterateTileGPU.hpp>
 
-template <class FunctorType, class ExecPolicy>
-class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
+#include <vector>
+
+template <class FunctorType, class... Traits>
+class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
                                 Kokkos::Experimental::SYCL> {
  public:
-  using Policy = ExecPolicy;
+  using Policy = Kokkos::RangePolicy<Traits...>;
 
  private:
   using Member       = typename Policy::member_type;
@@ -62,16 +64,15 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
   const Policy m_policy;
 
   template <typename Functor>
-  static void sycl_direct_launch(const Policy& policy, const Functor& functor) {
+  static sycl::event sycl_direct_launch(const Policy& policy,
+                                        const Functor& functor) {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    space.fence();
-
-    q.submit([functor, policy](sycl::handler& cgh) {
+    auto parallel_for_event = q.submit([functor, policy](sycl::handler& cgh) {
       sycl::range<1> range(policy.end() - policy.begin());
       const auto begin = policy.begin();
 
@@ -83,8 +84,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
           functor(WorkTag(), id);
       });
     });
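+    // Enqueue a barrier instead of fencing the host: later submissions to
+    // this (out-of-order) queue are ordered after the kernel, but the call
+    // returns immediately.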
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
 
-    space.fence();
+    return parallel_for_event;
   }
 
  public:
@@ -100,7 +102,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(const ParallelFor&) = delete;
@@ -201,41 +205,48 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <typename Functor>
-  void sycl_direct_launch(const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Functor& functor) const {
     // Convenience references
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *m_space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    m_space.fence();
-
-    if (m_policy.m_num_tiles == 0) return;
+    if (m_policy.m_num_tiles == 0) return {};
 
     const BarePolicy bare_policy(m_policy);
 
-    q.submit([functor, this, bare_policy](sycl::handler& cgh) {
-      const auto range = compute_ranges();
-
-      cgh.parallel_for(range, [functor, bare_policy](sycl::nd_item<3> item) {
-        const index_type local_x    = item.get_local_id(0);
-        const index_type local_y    = item.get_local_id(1);
-        const index_type local_z    = item.get_local_id(2);
-        const index_type global_x   = item.get_group(0);
-        const index_type global_y   = item.get_group(1);
-        const index_type global_z   = item.get_group(2);
-        const index_type n_global_x = item.get_group_range(0);
-        const index_type n_global_y = item.get_group_range(1);
-        const index_type n_global_z = item.get_group_range(2);
-
-        Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor,
-                                        typename Policy::work_tag>(
-            bare_policy, functor, {n_global_x, n_global_y, n_global_z},
-            {global_x, global_y, global_z}, {local_x, local_y, local_z})
-            .exec_range();
-      });
-    });
-
-    m_space.fence();
+    auto parallel_for_event =
+        q.submit([functor, this, bare_policy](sycl::handler& cgh) {
+          const auto range                  = compute_ranges();
+          const sycl::range<3> global_range = range.get_global_range();
+          const sycl::range<3> local_range  = range.get_local_range();
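+          // SYCL varies the last dimension fastest, so reverse the range
+          // here; the indices are swapped back inside the kernel.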
+          const sycl::nd_range sycl_swapped_range{
+              sycl::range<3>{global_range[2], global_range[1], global_range[0]},
+              sycl::range<3>{local_range[2], local_range[1], local_range[0]}};
+
+          cgh.parallel_for(sycl_swapped_range, [functor, bare_policy](
+                                                   sycl::nd_item<3> item) {
+            // swap back for correct index calculations in DeviceIterateTile
+            const index_type local_x    = item.get_local_id(2);
+            const index_type local_y    = item.get_local_id(1);
+            const index_type local_z    = item.get_local_id(0);
+            const index_type global_x   = item.get_group(2);
+            const index_type global_y   = item.get_group(1);
+            const index_type global_z   = item.get_group(0);
+            const index_type n_global_x = item.get_group_range(2);
+            const index_type n_global_y = item.get_group_range(1);
+            const index_type n_global_z = item.get_group_range(0);
+
+            Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor,
+                                            typename Policy::work_tag>(
+                bare_policy, functor, {n_global_x, n_global_y, n_global_z},
+                {global_x, global_y, global_z}, {local_x, local_y, local_z})
+                .exec_range();
+          });
+        });
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
+
+    return parallel_for_event;
   }
 
  public:
@@ -253,7 +264,8 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
-    sycl_direct_launch(functor_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(const ParallelFor&) = delete;
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
index 03b7753f8e81ef5045b16cedd4206d85174c0033..75237b4c72a4dbfc1b7ebe201dda240128f62ced 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
@@ -46,14 +46,99 @@
 #define KOKKOS_SYCL_PARALLEL_REDUCE_HPP
 
 #include <Kokkos_Macros.hpp>
+
+#include <vector>
 #if defined(KOKKOS_ENABLE_SYCL)
+#include <Kokkos_Parallel_Reduce.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
+namespace SYCLReduction {
+template <class ValueJoin, class ValueOps, typename WorkTag, typename ValueType,
+          typename ReducerType, typename FunctorType, int dim>
+void workgroup_reduction(sycl::nd_item<dim>& item,
+                         sycl::local_ptr<ValueType> local_mem,
+                         ValueType* results_ptr,
+                         ValueType* device_accessible_result_ptr,
+                         const unsigned int value_count,
+                         const ReducerType& selected_reducer,
+                         const FunctorType& functor, bool final) {
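+  // Two-level tree reduction: each subgroup first reduces its own slice of
+  // local_mem, then the first subgroup combines the per-subgroup results.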
+  const auto local_id = item.get_local_linear_id();
+  // FIXME_SYCL should be item.get_group().get_local_linear_range();
+  size_t wgroup_size = 1;
+  for (unsigned int i = 0; i < dim; ++i) wgroup_size *= item.get_local_range(i);
+
+  // Perform the actual workgroup reduction in each subgroup
+  // separately.
+  auto sg                = item.get_sub_group();
+  auto* result           = &local_mem[local_id * value_count];
+  const auto id_in_sg    = sg.get_local_id()[0];
+  const auto local_range = std::min(sg.get_local_range()[0], wgroup_size);
+  for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+    if (id_in_sg + stride < local_range)
+      ValueJoin::join(selected_reducer, result,
+                      &local_mem[(local_id + stride) * value_count]);
+    sg.barrier();
+  }
+  item.barrier(sycl::access::fence_space::local_space);
+
+  // Copy the subgroup results into the first positions of the
+  // reduction array.
+  if (id_in_sg == 0)
+    ValueOps::copy(functor, &local_mem[sg.get_group_id()[0] * value_count],
+                   result);
+  item.barrier(sycl::access::fence_space::local_space);
+
+  // Do the final reduction only using the first subgroup.
+  if (sg.get_group_id()[0] == 0) {
+    const auto n_subgroups = sg.get_group_range()[0];
+    auto* result_          = &local_mem[id_in_sg * value_count];
+    // If there are more subgroups than lanes in the first subgroup, fold
+    // the entries beyond the first subgroup's range into the lower ones
+    // first.
+    for (unsigned int offset = local_range; offset < n_subgroups;
+         offset += local_range)
+      if (id_in_sg + offset < n_subgroups)
+        ValueJoin::join(selected_reducer, result_,
+                        &local_mem[(id_in_sg + offset) * value_count]);
+    sg.barrier();
+
+    // Then, we proceed as before.
+    for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+      if (id_in_sg + stride < n_subgroups)
+        ValueJoin::join(selected_reducer, result_,
+                        &local_mem[(id_in_sg + stride) * value_count]);
+      sg.barrier();
+    }
+
+    // Finally, we copy the workgroup results back to global memory to be
+    // used in the next iteration. If this is the last iteration, i.e.,
+    // there is only one workgroup, we also call final() if necessary.
+    if (id_in_sg == 0) {
+      if (final) {
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, WorkTag>::final(functor, &local_mem[0]);
+        if (device_accessible_result_ptr != nullptr)
+          ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                         &local_mem[0]);
+        else
+          ValueOps::copy(functor, &results_ptr[0], &local_mem[0]);
+      } else
+        ValueOps::copy(functor,
+                       &results_ptr[(item.get_group_linear_id()) * value_count],
+                       &local_mem[0]);
+    }
+  }
+}
+
+}  // namespace SYCLReduction
+
 template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                      Kokkos::Experimental::SYCL> {
@@ -76,19 +161,29 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   ParallelReduce(
       const FunctorType& f, const Policy& p, const V& v,
       typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
-      : m_functor(f), m_policy(p), m_result_ptr(v.data()) {}
+      : m_functor(f),
+        m_policy(p),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename V::memory_space>::accessible) {}
 
   ParallelReduce(const FunctorType& f, const Policy& p,
                  const ReducerType& reducer)
       : m_functor(f),
         m_policy(p),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
 
  private:
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -121,18 +216,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const unsigned int value_count =
         FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
             selected_reducer);
-    // FIXME_SYCL only use the first half
     const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
-    // FIXME_SYCL without this we are running into a race condition
-    const auto results_ptr2 =
-        results_ptr + std::max(value_count, 1u) * init_size;
+        sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
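+    // Returned to execute() so the functor and reducer copies can register
+    // the event that consumes them.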
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         const auto begin = policy.begin();
         cgh.single_task([=]() {
           const auto& selected_reducer = ReducerConditional::select(
@@ -149,9 +244,13 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
             FunctorFinal<FunctorType, WorkTag>::final(
                 static_cast<const FunctorType&>(functor), results_ptr);
+          if (device_accessible_result_ptr != nullptr)
+            ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                           &results_ptr[0]);
         });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -163,7 +262,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       auto n_wgroups = ((size + values_per_thread - 1) / values_per_thread +
                         wgroup_size - 1) /
                        wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -217,49 +316,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
               }
               item.barrier(sycl::access::fence_space::local_space);
 
-              // Perform the actual workgroup reduction. To achieve a better
-              // memory access pattern, we use sequential addressing and a
-              // reversed loop. If the workgroup size is 8, the first element
-              // contains all the values with index%4==0, after the second one
-              // the values with index%2==0 and after the third one index%1==0,
-              // i.e., all values.
-              for (unsigned int stride = wgroup_size / 2; stride > 0;
-                   stride >>= 1) {
-                const auto idx = local_id;
-                if (idx < stride) {
-                  ValueJoin::join(selected_reducer,
-                                  &local_mem[idx * value_count],
-                                  &local_mem[(idx + stride) * value_count]);
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-              }
-
-              // Finally, we copy the workgroup results back to global memory to
-              // be used in the next iteration. If this is the last iteration,
-              // i.e., there is only one workgroup also call final() if
-              // necessary.
-              if (local_id == 0) {
-                ValueOps::copy(
-                    functor,
-                    &results_ptr2[(item.get_group_linear_id()) * value_count],
-                    &local_mem[0]);
-                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-                  if (n_wgroups <= 1)
-                    FunctorFinal<FunctorType, WorkTag>::final(
-                        static_cast<const FunctorType&>(functor),
-                        &results_ptr2[(item.get_group_linear_id()) *
-                                      value_count]);
-              }
+              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+                  item, local_mem.get_pointer(), results_ptr,
+                  device_accessible_result_ptr, value_count, selected_reducer,
+                  static_cast<const FunctorType&>(functor), n_wgroups <= 1);
             });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
 
-      // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      space.fence();
+      last_reduction_event = parallel_reduce_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -268,13 +333,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence due to "
+          "inaccessible reducer result location");
     }
+
+    return last_reduction_event;
   }
 
  public:
@@ -291,15 +360,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
-  FunctorType m_functor;
-  Policy m_policy;
-  ReducerType m_reducer;
-  pointer_type m_result_ptr;
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
 };
 
 template <class FunctorType, class ReducerType, class... Traits>
@@ -347,7 +419,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   ParallelReduce(
       const FunctorType& f, const Policy& p, const V& v,
       typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
-      : m_functor(f), m_policy(p), m_space(p.space()), m_result_ptr(v.data()) {}
+      : m_functor(f),
+        m_policy(p),
+        m_space(p.space()),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename V::memory_space>::accessible) {}
 
   ParallelReduce(const FunctorType& f, const Policy& p,
                  const ReducerType& reducer)
@@ -355,12 +433,17 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         m_policy(p),
         m_space(p.space()),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
 
  private:
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -379,8 +462,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         *m_space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    const int nwork = m_policy.m_num_tiles;
-    const int block_size =
+    const typename Policy::index_type nwork = m_policy.m_num_tiles;
+    const typename Policy::index_type block_size =
         std::pow(2, std::ceil(std::log2(m_policy.m_prod_tile_dims)));
 
     const sycl::range<1> local_range(block_size);
@@ -402,12 +485,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // FIXME_SYCL without this we are running into a race condition
     const auto results_ptr2 =
         results_ptr + std::max(value_count, 1u) * init_size;
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         cgh.single_task([=]() {
           const auto& selected_reducer = ReducerConditional::select(
               static_cast<const FunctorType&>(functor),
@@ -424,9 +511,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
             FunctorFinal<FunctorType, WorkTag>::final(
                 static_cast<const FunctorType&>(functor), results_ptr);
+          if (device_accessible_result_ptr)
+            ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                           &results_ptr[0]);
         });
       });
-      m_space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -435,8 +526,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // value.
     bool first_run = true;
     while (size > 1) {
-      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto n_wgroups             = (size + wgroup_size - 1) / wgroup_size;
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -498,47 +589,21 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           }
           item.barrier(sycl::access::fence_space::local_space);
 
-          // Perform the actual workgroup reduction. To achieve a better
-          // memory access pattern, we use sequential addressing and a
-          // reversed loop. If the workgroup size is 8, the first element
-          // contains all the values with index%4==0, after the second one
-          // the values with index%2==0 and after the third one index%1==0,
-          // i.e., all values.
-          for (unsigned int stride = wgroup_size / 2; stride > 0;
-               stride >>= 1) {
-            const auto idx = local_id;
-            if (idx < stride) {
-              ValueJoin::join(selected_reducer, &local_mem[idx * value_count],
-                              &local_mem[(idx + stride) * value_count]);
-            }
-            item.barrier(sycl::access::fence_space::local_space);
-          }
-
-          // Finally, we copy the workgroup results back to global memory to
-          // be used in the next iteration. If this is the last iteration,
-          // i.e., there is only one workgroup also call final() if
-          // necessary.
-          if (local_id == 0) {
-            ValueOps::copy(
-                functor,
-                &results_ptr2[(item.get_group_linear_id()) * value_count],
-                &local_mem[0]);
-            if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-              if (n_wgroups <= 1)
-                FunctorFinal<FunctorType, WorkTag>::final(
-                    static_cast<const FunctorType&>(functor),
-                    &results_ptr2[(item.get_group_linear_id()) * value_count]);
-          }
+          SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+              item, local_mem.get_pointer(), results_ptr2,
+              device_accessible_result_ptr, value_count, selected_reducer,
+              static_cast<const FunctorType&>(functor),
+              n_wgroups <= 1 && item.get_group_linear_id() == 0);
         });
       });
-      m_space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
 
       // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          m_space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      m_space.fence();
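+      // Copy asynchronously on the same queue; the barrier orders the next
+      // iteration after the copy without blocking the host.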
+      auto deep_copy_event =
+          q.memcpy(results_ptr, results_ptr2,
+                   sizeof(*m_result_ptr) * value_count * n_wgroups);
+      q.submit_barrier(std::vector<sycl::event>{deep_copy_event});
+      last_reduction_event = deep_copy_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -547,19 +612,23 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           m_space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      m_space.fence();
+      m_space.fence(
+          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence after deep "
+          "copying results back");
     }
+
+    return last_reduction_event;
   }
 
  public:
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy& policy, const Functor&) {
-    return policy.space().impl_internal_space_instance()->m_maxThreadsPerSM;
+    return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize;
   }
 
   void execute() const {
@@ -575,16 +644,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
-  FunctorType m_functor;
-  BarePolicy m_policy;
+  const FunctorType m_functor;
+  const BarePolicy m_policy;
   const Kokkos::Experimental::SYCL& m_space;
-  ReducerType m_reducer;
-  pointer_type m_result_ptr;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
index 5eac6bf9da62b29b9d15697bc5061c00db504e0c..d5611c2159bbc4bf0bd6a29fb89a941f7560650a 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
@@ -47,6 +47,7 @@
 
 #include <Kokkos_Macros.hpp>
 #include <memory>
+#include <vector>
 #if defined(KOKKOS_ENABLE_SYCL)
 
 namespace Kokkos {
@@ -86,96 +87,99 @@ class ParallelScanSYCLBase {
   void scan_internal(sycl::queue& q, const Functor& functor,
                      pointer_type global_mem, std::size_t size) const {
     // FIXME_SYCL optimize
-    constexpr size_t wgroup_size = 32;
+    constexpr size_t wgroup_size = 128;
     auto n_wgroups               = (size + wgroup_size - 1) / wgroup_size;
+    pointer_type group_results   = global_mem + n_wgroups * wgroup_size;
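+    // Per-workgroup totals are stored past the end of the scanned range in
+    // the same scratch buffer, replacing a separate per-launch USM
+    // allocation.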
 
-    // FIXME_SYCL The allocation should be handled by the execution space
-    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
-    std::unique_ptr<value_type[], decltype(deleter)> group_results_memory(
-        static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * n_wgroups,
-                                               q, sycl::usm::alloc::shared)),
-        deleter);
-    auto group_results = group_results_memory.get();
-
-    q.submit([&](sycl::handler& cgh) {
+    auto local_scans = q.submit([&](sycl::handler& cgh) {
       sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                      sycl::access::target::local>
           local_mem(sycl::range<1>(wgroup_size), cgh);
 
-      // FIXME_SYCL we get wrong results without this, not sure why
-      sycl::stream out(1, 1, cgh);
       cgh.parallel_for(
           sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
           [=](sycl::nd_item<1> item) {
-            const auto local_id  = item.get_local_linear_id();
-            const auto global_id = item.get_global_linear_id();
+            const auto local_id      = item.get_local_linear_id();
+            const auto global_id     = item.get_global_linear_id();
+            const auto global_offset = global_id - local_id;
 
             // Initialize local memory
             if (global_id < size)
-              ValueOps::copy(functor, &local_mem[local_id],
-                             &global_mem[global_id]);
+              local_mem[local_id] = global_mem[global_id];
             else
               ValueInit::init(functor, &local_mem[local_id]);
             item.barrier(sycl::access::fence_space::local_space);
 
-            // Perform workgroup reduction
-            for (size_t stride = 1; 2 * stride < wgroup_size + 1; stride *= 2) {
-              auto idx = 2 * stride * (local_id + 1) - 1;
-              if (idx < wgroup_size)
-                ValueJoin::join(functor, &local_mem[idx],
-                                &local_mem[idx - stride]);
-              item.barrier(sycl::access::fence_space::local_space);
+            // subgroup scans
+            auto sg                = item.get_sub_group();
+            const auto sg_group_id = sg.get_group_id()[0];
+            const int id_in_sg     = sg.get_local_id()[0];
+            for (int stride = wgroup_size / 2; stride > 0; stride >>= 1) {
+              auto tmp = sg.shuffle_up(local_mem[local_id], stride);
+              if (id_in_sg >= stride)
+                ValueJoin::join(functor, &local_mem[local_id], &tmp);
             }
 
-            if (local_id == 0) {
-              if (n_wgroups > 1)
-                ValueOps::copy(functor,
-                               &group_results[item.get_group_linear_id()],
-                               &local_mem[wgroup_size - 1]);
-              else
-                ValueInit::init(functor,
-                                &group_results[item.get_group_linear_id()]);
-              ValueInit::init(functor, &local_mem[wgroup_size - 1]);
-            }
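+            // The shuffle loop above computed an inclusive scan; save the
+            // subgroup total, then shift by one lane and reset lane 0 to
+            // make the scan exclusive.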
+            const int local_range = sg.get_local_range()[0];
+            if (id_in_sg == local_range - 1)
+              global_mem[sg_group_id + global_offset] = local_mem[local_id];
+            local_mem[local_id] = sg.shuffle_up(local_mem[local_id], 1);
+            if (id_in_sg == 0) ValueInit::init(functor, &local_mem[local_id]);
+            item.barrier(sycl::access::fence_space::local_space);
 
-            // Add results to all items
-            for (size_t stride = wgroup_size / 2; stride > 0; stride /= 2) {
-              auto idx = 2 * stride * (local_id + 1) - 1;
-              if (idx < wgroup_size) {
-                value_type dummy;
-                ValueOps::copy(functor, &dummy, &local_mem[idx - stride]);
-                ValueOps::copy(functor, &local_mem[idx - stride],
-                               &local_mem[idx]);
-                ValueJoin::join(functor, &local_mem[idx], &dummy);
+            // scan subgroup results using the first subgroup
+            if (sg_group_id == 0) {
+              const int n_subgroups = sg.get_group_range()[0];
+              if (local_range < n_subgroups) Kokkos::abort("Not implemented!");
+
+              for (int stride = n_subgroups / 2; stride > 0; stride >>= 1) {
+                auto tmp =
+                    sg.shuffle_up(global_mem[id_in_sg + global_offset], stride);
+                if (id_in_sg >= stride) {
+                  if (id_in_sg < n_subgroups)
+                    ValueJoin::join(
+                        functor, &global_mem[id_in_sg + global_offset], &tmp);
+                  else
+                    global_mem[id_in_sg + global_offset] = tmp;
+                }
               }
-              item.barrier(sycl::access::fence_space::local_space);
             }
+            item.barrier(sycl::access::fence_space::local_space);
+
+            // add results to all subgroups
+            if (sg_group_id > 0)
+              ValueJoin::join(functor, &local_mem[local_id],
+                              &global_mem[sg_group_id - 1 + global_offset]);
+            item.barrier(sycl::access::fence_space::local_space);
+            if (n_wgroups > 1 && local_id == wgroup_size - 1)
+              group_results[item.get_group_linear_id()] =
+                  global_mem[sg_group_id + global_offset];
+            item.barrier(sycl::access::fence_space::local_space);
 
             // Write results to global memory
-            if (global_id < size)
-              ValueOps::copy(functor, &global_mem[global_id],
-                             &local_mem[local_id]);
+            if (global_id < size) global_mem[global_id] = local_mem[local_id];
           });
     });
-
-    if (n_wgroups > 1) scan_internal(q, functor, group_results, n_wgroups);
-    m_policy.space().fence();
-
-    q.submit([&](sycl::handler& cgh) {
-      cgh.parallel_for(sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
-                       [=](sycl::nd_item<1> item) {
-                         const auto global_id = item.get_global_linear_id();
-                         if (global_id < size)
-                           ValueJoin::join(
-                               functor, &global_mem[global_id],
-                               &group_results[item.get_group_linear_id()]);
-                       });
-    });
-    m_policy.space().fence();
+    q.submit_barrier(std::vector<sycl::event>{local_scans});
+
+    if (n_wgroups > 1) {
+      scan_internal(q, functor, group_results, n_wgroups);
+      auto update_with_group_results = q.submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+            [=](sycl::nd_item<1> item) {
+              const auto global_id = item.get_global_linear_id();
+              if (global_id < size)
+                ValueJoin::join(functor, &global_mem[global_id],
+                                &group_results[item.get_group_linear_id()]);
+            });
+      });
+      q.submit_barrier(std::vector<sycl::event>{update_with_group_results});
+    }
   }
 
   template <typename Functor>
-  void sycl_direct_launch(const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Functor& functor) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = m_policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
@@ -185,7 +189,7 @@ class ParallelScanSYCLBase {
     const std::size_t len = m_policy.end() - m_policy.begin();
 
     // Initialize global memory
-    q.submit([&](sycl::handler& cgh) {
+    auto initialize_global_memory = q.submit([&](sycl::handler& cgh) {
       auto global_mem = m_scratch_space;
       auto begin      = m_policy.begin();
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
@@ -197,29 +201,30 @@ class ParallelScanSYCLBase {
           functor(id, update, false);
         else
           functor(WorkTag(), id, update, false);
-        ValueOps::copy(functor, &global_mem[id], &update);
+        global_mem[id] = update;
       });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{initialize_global_memory});
 
-    // Perform the actual exlcusive scan
+    // Perform the actual exclusive scan
     scan_internal(q, functor, m_scratch_space, len);
 
     // Write results to global memory
-    q.submit([&](sycl::handler& cgh) {
+    auto update_global_results = q.submit([&](sycl::handler& cgh) {
       auto global_mem = m_scratch_space;
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
-        auto global_id = item.get_id();
+        auto global_id = item.get_id(0);
 
         value_type update = global_mem[global_id];
         if constexpr (std::is_same<WorkTag, void>::value)
           functor(global_id, update, true);
         else
           functor(WorkTag(), global_id, update, true);
-        ValueOps::copy(functor, &global_mem[global_id], &update);
+        global_mem[global_id] = update;
       });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{update_global_results});
+    return update_global_results;
   }
 
  public:
@@ -227,28 +232,39 @@ class ParallelScanSYCLBase {
   void impl_execute(const PostFunctor& post_functor) {
     if (m_policy.begin() == m_policy.end()) return;
 
-    const auto& q = *m_policy.space().impl_internal_space_instance()->m_queue;
+    auto& instance        = *m_policy.space().impl_internal_space_instance();
     const std::size_t len = m_policy.end() - m_policy.begin();
 
-    // FIXME_SYCL The allocation should be handled by the execution space
-    // consider only storing one value per block and recreate initial results in
-    // the end before doing the final pass
-    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
-    std::unique_ptr<value_type[], decltype(deleter)> result_memory(
-        static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * len, q,
-                                               sycl::usm::alloc::shared)),
-        deleter);
-    m_scratch_space = result_memory.get();
+    // Compute the total amount of memory we will need. We emulate the recursive
+    // structure that is used to do the actual scan. Essentially, we need to
+    // allocate memory for the whole range and then recursively for the reduced
+    // group results until only one group is left.
+    std::size_t total_memory = 0;
+    {
+      size_t wgroup_size   = 128;
+      size_t n_nested_size = len;
+      size_t n_nested_wgroups;
+      do {
+        n_nested_wgroups = (n_nested_size + wgroup_size - 1) / wgroup_size;
+        n_nested_size    = n_nested_wgroups;
+        total_memory += sizeof(value_type) * n_nested_wgroups * wgroup_size;
+      } while (n_nested_wgroups > 1);
+      total_memory += sizeof(value_type) * wgroup_size;
+    }
+
+    // FIXME_SYCL consider only storing one value per block and recreate initial
+    // results in the end before doing the final pass
+    m_scratch_space =
+        static_cast<pointer_type>(instance.scratch_space(total_memory));
 
     Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
-        indirectKernelMem = m_policy.space()
-                                .impl_internal_space_instance()
-                                ->m_indirectKernelMem;
+        indirectKernelMem = instance.m_indirectKernelMem;
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
 
-    sycl_direct_launch(functor_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
     post_functor();
   }
 
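A note on the sizing loop in impl_execute above: it emulates the recursion in scan_internal, padding each level of group results to whole workgroups until a single group remains. Below is a host-side sketch of the same computation as a hypothetical standalone helper; the default of 128 mirrors the wgroup_size hardcoded above, and nothing here is Kokkos API.

```cpp
#include <cstddef>

// Hypothetical standalone version of the scratch-memory sizing loop:
// pad every recursion level of group results to whole workgroups and
// sum the levels until a single group remains.
std::size_t scan_scratch_bytes(std::size_t len, std::size_t value_size,
                               std::size_t wgroup_size = 128) {
  std::size_t total_memory  = 0;
  std::size_t n_nested_size = len;
  std::size_t n_nested_wgroups;
  do {
    n_nested_wgroups = (n_nested_size + wgroup_size - 1) / wgroup_size;
    n_nested_size    = n_nested_wgroups;
    total_memory += value_size * n_nested_wgroups * wgroup_size;
  } while (n_nested_wgroups > 1);
  // One extra padded group for the top level's group results.
  total_memory += value_size * wgroup_size;
  return total_memory;
}
```

For example, len = 10000 with 8-byte values needs 79 padded groups (80896 bytes), then one group (1024 bytes), plus the final padded group (1024 bytes): 82944 bytes in total.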
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 738620926b5496b9710ce001b77c6fb625325320..9538bf708077cc50404e66e19e048d4341a19761 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -47,8 +47,11 @@
 
 #include <Kokkos_Parallel.hpp>
 
+#include <SYCL/Kokkos_SYCL_Parallel_Reduce.hpp>  // workgroup_reduction
 #include <SYCL/Kokkos_SYCL_Team.hpp>
 
+#include <vector>
+
 namespace Kokkos {
 namespace Impl {
 template <typename... Properties>
@@ -63,8 +66,6 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   friend class TeamPolicyInternal;
 
  private:
-  static int constexpr MAX_WARP = 8;
-
   typename traits::execution_space m_space;
   int m_league_size;
   int m_team_size;
@@ -128,11 +129,18 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   }
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
+  // FIXME_SYCL This is correct in most cases, but not necessarily when a
+  // custom sycl::queue is used to initialize the execution space.
   static int vector_length_max() {
-    // FIXME_SYCL provide a reasonable value
-    return 1;
+    std::vector<size_t> sub_group_sizes =
+        execution_space{}
+            .impl_internal_space_instance()
+            ->m_queue->get_device()
+            .template get_info<sycl::info::device::sub_group_sizes>();
+    return *std::max_element(sub_group_sizes.begin(), sub_group_sizes.end());
   }
 
+ private:
   static int verify_requested_vector_length(int requested_vector_length) {
     int test_vector_length =
         std::min(requested_vector_length, vector_length_max());
@@ -140,18 +148,14 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
     // Allow only power-of-two vector_length
     if (!(is_integral_power_of_two(test_vector_length))) {
       int test_pow2 = 1;
-      for (int i = 0; i < 5; i++) {
-        test_pow2 = test_pow2 << 1;
-        if (test_pow2 > test_vector_length) {
-          break;
-        }
-      }
+      while (test_pow2 < test_vector_length) test_pow2 <<= 1;
       test_vector_length = test_pow2 >> 1;
     }
 
     return test_vector_length;
   }
 
+ public:
   static int scratch_size_max(int level) {
     return level == 0 ? 1024 * 32
                       :           // FIXME_SYCL arbitrarily setting this to 32kB
@@ -160,7 +164,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
   inline void impl_set_team_size(size_t size) { m_team_size = size; }
   int impl_vector_length() const { return m_vector_length; }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
+#endif
 
   int team_size() const { return m_team_size; }
 
@@ -206,7 +212,21 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
         m_chunk_size(0),
         m_tune_team_size(bool(team_size_request <= 0)),
         m_tune_vector_length(bool(vector_length_request <= 0)) {
-    // FIXME_SYCL check paramters
+    // FIXME_SYCL Check that league size is permissible,
+    // https://github.com/intel/llvm/pull/4064
+
+    // Make sure total block size is permissible
+    if (m_team_size * m_vector_length >
+        static_cast<int>(
+            m_space.impl_internal_space_instance()->m_maxWorkgroupSize)) {
+      Impl::throw_runtime_exception(
+          std::string("Kokkos::TeamPolicy<SYCL> the team size is too large. "
+                      "Team size x vector length is " +
+                      std::to_string(m_team_size * m_vector_length) +
+                      " but must be smaller than ") +
+          std::to_string(
+              m_space.impl_internal_space_instance()->m_maxWorkgroupSize));
+    }
   }
 
   /** \brief  Specify league size, request team size */
@@ -311,8 +331,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
          2 * sizeof(double) - m_team_scratch_size[0]) /
         (sizeof(double) + m_thread_scratch_size[0]);
     return std::min<int>(
-        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
-        max_threads_for_memory);
+               m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+               max_threads_for_memory) /
+           impl_vector_length();
   }
 
   template <class FunctorType>
@@ -335,8 +356,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
         (sizeof(double) + sizeof(value_type) * value_count +
          m_thread_scratch_size[0]);
     return std::min<int>(
-        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
-        max_threads_for_memory);
+               m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+               max_threads_for_memory) /
+           impl_vector_length();
   }
 
   template <class FunctorType>
@@ -376,14 +398,15 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_scratch_size[2];
 
   template <typename Functor>
-  void sycl_direct_launch(const Policy& policy, const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Policy& policy,
+                                 const Functor& functor) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    q.submit([&](sycl::handler& cgh) {
+    auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
       // FIXME_SYCL accessors seem to need a size greater than zero at least for
       // host queues
       sycl::accessor<char, 1, sycl::access::mode::read_write,
@@ -399,14 +422,22 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
       cgh.parallel_for(
           sycl::nd_range<2>(
-              sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+              sycl::range<2>(m_team_size, m_league_size * m_vector_size),
               sycl::range<2>(m_team_size, m_vector_size)),
           [=](sycl::nd_item<2> item) {
+#ifdef KOKKOS_ENABLE_DEBUG
+            if (item.get_sub_group().get_local_range() %
+                    item.get_local_range(1) !=
+                0)
+              Kokkos::abort(
+                  "The sub_group size is not divisible by the vector_size. "
+                  "Choose a smaller vector_size!");
+#endif
             const member_type team_member(
                 team_scratch_memory_L0.get_pointer(), shmem_begin,
                 scratch_size[0],
                 static_cast<char*>(scratch_ptr[1]) +
-                    item.get_group(0) * scratch_size[1],
+                    item.get_group(1) * scratch_size[1],
                 scratch_size[1], item);
             if constexpr (std::is_same<work_tag, void>::value)
               functor(team_member);
@@ -414,7 +445,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
               functor(work_tag(), team_member);
           });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
+    return parallel_for_event;
   }
 
  public:
@@ -429,7 +461,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
@@ -451,11 +485,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    const auto& space    = *m_policy.space().impl_internal_space_instance();
-    const sycl::queue& q = *space.m_queue;
-    m_scratch_ptr[0]     = nullptr;
-    m_scratch_ptr[1]     = sycl::malloc_device(
-        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+    auto& space      = *m_policy.space().impl_internal_space_instance();
+    m_scratch_ptr[0] = nullptr;
+    m_scratch_ptr[1] = space.resize_team_scratch_space(
+        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size);
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -463,27 +496,17 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
              "Requested "
           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
-          << '\n';
+          << space.m_maxShmemPerBlock << '\n';
       Kokkos::Impl::throw_runtime_exception(out.str());
     }
 
+    const auto max_team_size =
+        m_policy.team_size_max(arg_functor, ParallelForTag{});
-    if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
+    if (m_team_size > max_team_size)
       Kokkos::Impl::throw_runtime_exception(
-          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
-  }
-
-  // FIXME_SYCL remove when managing m_scratch_ptr[1] in the execution space
-  // instance
-  ParallelFor(const ParallelFor&) = delete;
-  ParallelFor& operator=(const ParallelFor&) = delete;
-
-  ~ParallelFor() {
-    const Kokkos::Experimental::SYCL& space = m_policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
-    sycl::free(m_scratch_ptr[1], q);
+          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size. The "
+          "maximal team_size is " +
+          std::to_string(max_team_size) + '!');
   }
 };
 
@@ -516,6 +539,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
   // FIXME_SYCL avoid reallocating memory for reductions
   /*  size_type* m_scratch_space;
     size_type* m_scratch_flags;
@@ -529,8 +553,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const size_type m_vector_size;
 
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -553,25 +578,25 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     sycl::queue& q = *instance.m_queue;
 
     // FIXME_SYCL optimize
-    const size_t wgroup_size = m_team_size;
-    std::size_t size         = m_league_size * m_team_size;
+    const size_t wgroup_size = m_team_size * m_vector_size;
+    std::size_t size         = m_league_size * m_team_size * m_vector_size;
     const auto init_size =
         std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
     const unsigned int value_count =
         FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
             selected_reducer);
-    // FIXME_SYCL only use the first half
     const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
-    // FIXME_SYCL without this we are running into a race condition
-    const auto results_ptr2 =
-        results_ptr + std::max(value_count, 1u) * init_size;
+        sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         // FIXME_SYCL accessors seem to need a size greater than zero at least
         // for host queues
         sycl::accessor<char, 1, sycl::access::mode::read_write,
@@ -606,9 +631,13 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
                 FunctorFinal<FunctorType, WorkTag>::final(
                     static_cast<const FunctorType&>(functor), results_ptr);
+              if (device_accessible_result_ptr)
+                ValueOps::copy(functor, device_accessible_result_ptr,
+                               &results_ptr[0]);
             });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -617,8 +646,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // value.
     bool first_run = true;
     while (size > 1) {
-      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto n_wgroups             = (size + wgroup_size - 1) / wgroup_size;
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -638,9 +667,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
         cgh.parallel_for(
             sycl::nd_range<2>(
-                sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+                sycl::range<2>(m_team_size, m_league_size * m_vector_size),
                 sycl::range<2>(m_team_size, m_vector_size)),
             [=](sycl::nd_item<2> item) {
+#ifdef KOKKOS_ENABLE_DEBUG
+              if (first_run && item.get_sub_group().get_local_range() %
+                                       item.get_local_range(1) !=
+                                   0)
+                Kokkos::abort(
+                    "The sub_group size is not divisible by the vector_size. "
+                    "Choose a smaller vector_size!");
+#endif
               const auto local_id = item.get_local_linear_id();
               const auto global_id =
                   wgroup_size * item.get_group_linear_id() + local_id;
@@ -651,9 +688,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               // In the first iteration, we call functor to initialize the local
               // memory. Otherwise, the local memory is initialized with the
               // results from the previous iteration that are stored in global
-              // memory. Note that we load values_per_thread values per thread
-              // and immediately combine them to avoid too many threads being
-              // idle in the actual workgroup reduction.
+              // memory.
               if (first_run) {
                 reference_type update = ValueInit::init(
                     selected_reducer, &local_mem[local_id * value_count]);
@@ -661,7 +696,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     team_scratch_memory_L0.get_pointer(), shmem_begin,
                     scratch_size[0],
                     static_cast<char*>(scratch_ptr[1]) +
-                        item.get_group(0) * scratch_size[1],
+                        item.get_group(1) * scratch_size[1],
                     scratch_size[1], item);
                 if constexpr (std::is_same<WorkTag, void>::value)
                   functor(team_member, update);
@@ -678,50 +713,18 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               }
               item.barrier(sycl::access::fence_space::local_space);
 
-              // Perform the actual workgroup reduction. To achieve a better
-              // memory access pattern, we use sequential addressing and a
-              // reversed loop. If the workgroup size is 8, the first element
-              // contains all the values with index%4==0, after the second one
-              // the values with index%2==0 and after the third one index%1==0,
-              // i.e., all values.
-              for (unsigned int stride = wgroup_size / 2; stride > 0;
-                   stride >>= 1) {
-                const auto idx = local_id;
-                if (idx < stride) {
-                  ValueJoin::join(selected_reducer,
-                                  &local_mem[idx * value_count],
-                                  &local_mem[(idx + stride) * value_count]);
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-              }
+              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+                  item, local_mem.get_pointer(), results_ptr,
+                  device_accessible_result_ptr, value_count, selected_reducer,
+                  static_cast<const FunctorType&>(functor),
+                  n_wgroups <= 1 && item.get_group_linear_id() == 0);
 
-              // Finally, we copy the workgroup results back to global memory to
-              // be used in the next iteration. If this is the last iteration,
-              // i.e., there is only one workgroup also call final() if
-              // necessary.
-              if (local_id == 0) {
-                ValueOps::copy(
-                    functor,
-                    &results_ptr2[(item.get_group_linear_id()) * value_count],
-                    &local_mem[0]);
-                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-                  if (n_wgroups <= 1 && item.get_group_linear_id() == 0) {
-                    FunctorFinal<FunctorType, WorkTag>::final(
-                        static_cast<const FunctorType&>(functor),
-                        &results_ptr2[(item.get_group_linear_id()) *
-                                      value_count]);
-                  }
-              }
+              // FIXME_SYCL not quite sure why this is necessary
+              item.barrier(sycl::access::fence_space::global_space);
             });
       });
-      space.fence();
-
-      // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -730,13 +733,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ParallelReduce<TeamPolicy,SYCL>: fence because "
+          "reduction can't access result storage location");
     }
+
+    return last_reduction_event;
   }
 
  public:
@@ -753,8 +760,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
@@ -779,11 +788,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    const auto& space    = *m_policy.space().impl_internal_space_instance();
-    const sycl::queue& q = *space.m_queue;
-    m_scratch_ptr[0]     = nullptr;
-    m_scratch_ptr[1]     = sycl::malloc_device(
-        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+    auto& space      = *m_policy.space().impl_internal_space_instance();
+    m_scratch_ptr[0] = nullptr;
+    m_scratch_ptr[1] = space.resize_team_scratch_space(
+        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size);
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -791,8 +799,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
              "Requested "
           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
-          << '\n';
+          << space.m_maxShmemPerBlock << '\n';
       Kokkos::Impl::throw_runtime_exception(out.str());
     }
 
@@ -811,6 +818,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ViewType::memory_space>::accessible),
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
         m_vector_size(arg_policy.impl_vector_length()) {
@@ -823,6 +833,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy(arg_policy),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
         m_vector_size(arg_policy.impl_vector_length()) {
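vector_length_max above now derives its limit from the device's supported sub-group sizes instead of returning 1. A minimal sketch of the underlying SYCL query, assuming a caller-provided sycl::queue rather than the Kokkos-managed instance (max_vector_length is a hypothetical name):

```cpp
#include <sycl/sycl.hpp>

#include <algorithm>
#include <vector>

// Sketch of the device query behind vector_length_max(): pick the largest
// sub-group size the device supports.
int max_vector_length(const sycl::queue& q) {
  const std::vector<size_t> sizes =
      q.get_device().get_info<sycl::info::device::sub_group_sizes>();
  return static_cast<int>(*std::max_element(sizes.begin(), sizes.end()));
}
```

verify_requested_vector_length then rounds any non-power-of-two request down to the previous power of two, so the vector length actually used never exceeds this query's result.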
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
index 75741438e295c543db2737e6943ea52e244d69db..6ec6204e711586b0d88d6882955d21bf830a5327 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
@@ -56,64 +56,22 @@
 /*--------------------------------------------------------------------------*/
 namespace Kokkos {
 namespace Impl {
-namespace {
-auto USM_memcpy(sycl::queue& q, void* dst, const void* src, size_t n) {
-  return q.memcpy(dst, src, n);
-}
-
-void USM_memcpy(Kokkos::Experimental::Impl::SYCLInternal& space, void* dst,
-                const void* src, size_t n) {
-  (void)USM_memcpy(*space.m_queue, dst, src, n);
-}
-
-void USM_memcpy(void* dst, const void* src, size_t n) {
-  Experimental::SYCL().fence();
-  auto event = USM_memcpy(
-      *Experimental::Impl::SYCLInternal::singleton().m_queue, dst, src, n);
-  Experimental::Impl::SYCLInternal::fence(event);
-}
-}  // namespace
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>::
-    DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
-             const void* src, size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
-}
 
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
+void DeepCopySYCL(void* dst, const void* src, size_t n) {
+  Experimental::SYCL().fence("Kokkos::Impl::DeepCopySYCL: fence before memcpy");
+  Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n);
+  Experimental::SYCL().fence("Kokkos::Impl::DeepCopySYCL: fence after memcpy");
 }
 
-DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
-                                                   instance,
-                                               void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
+void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
+                       const void* src, size_t n) {
+  instance.impl_internal_space_instance()->m_queue->memcpy(dst, src, n);
 }
 
-DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
-}
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
-                                                   instance,
-                                               void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
-}
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
+void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) {
+  Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n);
+  Experimental::SYCL().fence(
+      "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy");
 }
 
 }  // namespace Impl
@@ -135,6 +93,11 @@ SYCLSharedUSMSpace::SYCLSharedUSMSpace()
 SYCLSharedUSMSpace::SYCLSharedUSMSpace(sycl::queue queue)
     : m_queue(std::move(queue)) {}
 
+SYCLHostUSMSpace::SYCLHostUSMSpace()
+    : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {}
+SYCLHostUSMSpace::SYCLHostUSMSpace(sycl::queue queue)
+    : m_queue(std::move(queue)) {}
+
 void* allocate_sycl(
     const char* arg_label, const size_t arg_alloc_size,
     const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle,
@@ -184,6 +147,19 @@ void* SYCLSharedUSMSpace::allocate(const char* arg_label,
       sycl::usm::alloc::shared, m_queue);
 }
 
+void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void* SYCLHostUSMSpace::allocate(const char* arg_label,
+                                 const size_t arg_alloc_size,
+                                 const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost,
+      sycl::usm::alloc::host, m_queue);
+}
+
 void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
                      const size_t arg_alloc_size, const size_t arg_logical_size,
                      const Kokkos::Tools::SpaceHandle arg_handle,
@@ -195,6 +171,8 @@ void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
                                       reported_size);
   }
 
+  SYCL::impl_static_fence(
+      "Kokkos::Impl::sycl_deallocate: fence before deallocate");
   sycl::free(arg_alloc_ptr, queue);
 }
 
@@ -223,6 +201,19 @@ void SYCLSharedUSMSpace::deallocate(const char* arg_label,
                   Kokkos::Tools::make_space_handle(name()), m_queue);
 }
 
+void SYCLHostUSMSpace::deallocate(void* const arg_alloc_ptr,
+                                  const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void SYCLHostUSMSpace::deallocate(const char* arg_label,
+                                  void* const arg_alloc_ptr,
+                                  const size_t arg_alloc_size,
+                                  const size_t arg_logical_size) const {
+  sycl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size,
+                  Kokkos::Tools::make_space_handle(name()), m_queue);
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
@@ -235,6 +226,9 @@ SharedAllocationRecord<void, void> SharedAllocationRecord<
 
 SharedAllocationRecord<void, void> SharedAllocationRecord<
     Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record;
+
+SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record;
 #endif
 
 SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
@@ -282,6 +276,27 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
                                                   arg_label);
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
+      m_space(arg_space) {
+
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -317,6 +332,17 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace,
                      alloc_size, alloc_size - sizeof(SharedAllocationHeader));
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace,
+                       void>::~SharedAllocationRecord() {
+  const char* label = nullptr;
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    label = RecordBase::m_alloc_ptr->m_label;
+  }
+  const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     alloc_size, alloc_size - sizeof(SharedAllocationHeader));
+}
+
 //----------------------------------------------------------------------------
 
 }  // namespace Impl
@@ -339,6 +365,8 @@ template class SharedAllocationRecordCommon<
     Kokkos::Experimental::SYCLDeviceUSMSpace>;
 template class SharedAllocationRecordCommon<
     Kokkos::Experimental::SYCLSharedUSMSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::SYCLHostUSMSpace>;
 
 }  // namespace Impl
 }  // namespace Kokkos
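The new SYCLHostUSMSpace reuses the allocate_sycl/sycl_deallocate path with sycl::usm::alloc::host, and DeepCopySYCL brackets the queue memcpy with fences. A self-contained sketch of both pieces in plain SYCL, assuming the SYCL 2020 header layout; sycl::malloc_host<T> is the typed shorthand for sycl::malloc with sycl::usm::alloc::host:

```cpp
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  constexpr size_t n = 1024;

  // Host-USM allocation: host-resident but device-accessible, as used by
  // SYCLHostUSMSpace::allocate above.
  double* host_buf = sycl::malloc_host<double>(n, q);
  double* dev_buf  = sycl::malloc_device<double>(n, q);

  // DeepCopySYCL-style blocking copy: fence, memcpy on the queue, fence.
  q.wait();  // "fence before memcpy"
  q.memcpy(dev_buf, host_buf, n * sizeof(double));
  q.wait();  // "fence after memcpy"

  sycl::free(dev_buf, q);
  sycl::free(host_buf, q);
}
```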
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
index a30cf2109a60ccc5934bfc6ee834a831c539d485..c405ad31a5fb6d9bb7abee273b9ff10c474b134c 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -92,14 +92,12 @@ class SYCLTeamMember {
     return m_item.get_group_linear_id();
   }
   KOKKOS_INLINE_FUNCTION int league_size() const {
-    // FIXME_SYCL needs to be revised for vector_length>1.
-    return m_item.get_group_range(0);
+    return m_item.get_group_range(1);
   }
   KOKKOS_INLINE_FUNCTION int team_rank() const {
-    return m_item.get_local_linear_id();
+    return m_item.get_local_id(0);
   }
   KOKKOS_INLINE_FUNCTION int team_size() const {
-    // FIXME_SYCL needs to be revised for vector_length>1.
     return m_item.get_local_range(0);
   }
   KOKKOS_INLINE_FUNCTION void team_barrier() const { m_item.barrier(); }
@@ -109,8 +107,17 @@ class SYCLTeamMember {
   //--------------------------------------------------------------------------
 
   template <class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& val,
-                                             const int thread_id) const {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_arithmetic_v<ValueType>>
+  team_broadcast(ValueType& val, const int thread_id) const {
+    val = sycl::group_broadcast(m_item.get_group(), val,
+                                sycl::id<2>(thread_id, 0));
+  }
+
+  // FIXME_SYCL remove/adapt this overload once the Intel oneAPI implementation
+  // is conforming to the SYCL2020 standard (allowing trivially-copyable types)
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<!std::is_arithmetic_v<ValueType>>
+  team_broadcast(ValueType& val, const int thread_id) const {
     // Wait for shared data write until all threads arrive here
     m_item.barrier(sycl::access::fence_space::local_space);
     if (m_item.get_local_id(1) == 0 &&
@@ -119,7 +126,7 @@ class SYCLTeamMember {
     }
     // Wait for shared data read until root thread writes
     m_item.barrier(sycl::access::fence_space::local_space);
-    val = *static_cast<ValueType*>(m_team_reduce);
+    val = *(static_cast<ValueType*>(m_team_reduce));
   }
 
   template <class Closure, class ValueType>
@@ -294,35 +301,43 @@ class SYCLTeamMember {
   //----------------------------------------
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION
       typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer) {
+      vector_reduce(ReducerType const& reducer) const {
     vector_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION
       typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& /*reducer*/,
-                    typename ReducerType::value_type& /*value*/) {
-    // FIXME_SYCL
-    Kokkos::abort("Not implemented!");
-  }
+      vector_reduce(ReducerType const& reducer,
+                    typename ReducerType::value_type& value) const {
+    const auto tidx1   = m_item.get_local_id(1);
+    const auto grange1 = m_item.get_local_range(1);
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& /*reducer*/,
-                    int* const /*global_scratch_flags*/,
-                    void* const /*global_scratch_space*/, void* const /*shmem*/,
-                    int const /*shmem_size*/) {
-    // FIXME_SYCL
-    Kokkos::abort("Not implemented!");
+    const auto sg = m_item.get_sub_group();
+
+    if (grange1 == 1) return;
+
+    // Intra vector lane shuffle reduction:
+    typename ReducerType::value_type tmp(value);
+    typename ReducerType::value_type tmp2 = tmp;
+
+    for (int i = grange1; (i >>= 1);) {
+      tmp2 = sg.shuffle_down(tmp, i);
+      if (static_cast<int>(tidx1) < i) {
+        reducer.join(tmp, tmp2);
+      }
+    }
+
+    // Broadcast from root lane to all other lanes.
+    // Cannot use "butterfly" algorithm to avoid the broadcast
+    // because floating point summation is not associative
+    // and thus different threads could have different results.
+
+    tmp2  = sg.shuffle(tmp, (sg.get_local_id() / grange1) * grange1);
+    value = tmp2;
+    reducer.reference() = tmp2;
   }
 
   //----------------------------------------
@@ -489,7 +504,6 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -516,7 +530,6 @@ KOKKOS_INLINE_FUNCTION
   typename ReducerType::value_type value;
   reducer.init(value);
 
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -546,7 +559,6 @@ KOKKOS_INLINE_FUNCTION
 
   reducer.init(reducer.reference());
 
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -609,11 +621,14 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYCL adapt for vector_length != 1
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0))
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
     closure(i);
 }
 
@@ -623,17 +638,20 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember>& loop_boundaries,
                     const Closure& closure, const ReducerType& reducer) {
-  // FIXME_SYCL adapt for vector_length != 1
   typename ReducerType::value_type value;
   reducer.init(value);
 
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0)) {
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
     closure(i, value);
-  }
 
+  loop_boundaries.member.vector_reduce(reducer, value);
   loop_boundaries.member.team_reduce(reducer, value);
 }
 
@@ -643,20 +661,23 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember>& loop_boundaries,
                     const Closure& closure, ValueType& result) {
-  // FIXME_SYCL adapt for vector_length != 1
   ValueType val;
   Kokkos::Sum<ValueType> reducer(val);
 
   reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0)) {
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
     closure(i, val);
-  }
 
-  loop_boundaries.member.team_reduce(reducer, val);
+  loop_boundaries.member.vector_reduce(reducer);
+  loop_boundaries.member.team_reduce(reducer);
   result = reducer.reference();
 }
 
@@ -673,9 +694,14 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYC: adapt for vector_length!=1
-  for (auto i = loop_boundaries.start; i != loop_boundaries.end; ++i)
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i);
+
+  loop_boundaries.member.item().get_sub_group().barrier();
 }
 
 //----------------------------------------------------------------------------
@@ -697,12 +723,16 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember> const& loop_boundaries,
                     Closure const& closure, ReducerType const& reducer) {
-  // FIXME_SYCL adapt for vector_length != 1
   reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i, reducer.reference());
-  }
+
+  loop_boundaries.member.vector_reduce(reducer);
 }
 
 /** \brief  Intra-thread vector parallel_reduce.
@@ -722,12 +752,16 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember> const& loop_boundaries,
                     Closure const& closure, ValueType& result) {
-  // FIXME_SYCL adapt for vector_length != 1
   result = ValueType();
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+  const int grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i, result);
-  }
+
+  loop_boundaries.member.vector_reduce(Kokkos::Sum<ValueType>(result));
 }
 
 //----------------------------------------------------------------------------
@@ -746,15 +780,59 @@ KOKKOS_INLINE_FUNCTION
     parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
                       iType, Impl::SYCLTeamMember>& loop_boundaries,
                   const Closure& closure, const ReducerType& reducer) {
-  // FIXME_SYCL modify for vector_length!=1
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
 
   value_type accum;
   reducer.init(accum);
+  const value_type identity = accum;
+
+  // Loop through the boundaries in vector-length chunks; a scan is performed
+  // at each iteration.
+  // All thread "lanes" must loop the same number of times, so determine a
+  // common loop end for every "lane."
+  // Requires:
+  //   grange1 is power of two and thus
+  //     ( end % grange1 ) == ( end & ( grange1 - 1 ) )
+  //   1 <= grange1 <= sub_group size
+
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  const int mask          = grange1 - 1;
+  const int rem           = loop_boundaries.end & mask;  // == end % grange1
+  const int end           = loop_boundaries.end + (rem ? grange1 - rem : 0);
+  const auto sg           = loop_boundaries.member.item().get_sub_group();
+  const int vector_offset = (sg.get_local_id() / grange1) * grange1;
+
+  for (int i = tidx1; i < end; i += grange1) {
+    value_type val = identity;
+
+    // First acquire per-lane contributions.
+    // This sets i's val to i-1's contribution to make the latter shfl_up an
+    // exclusive scan -- the final accumulation of i's val will be included in
+    // the second closure call later.
+    if (i < loop_boundaries.end && tidx1 > 0) closure(i - 1, val, false);
+
+    // Bottom up exclusive scan in triangular pattern where each SYCL thread is
+    // the root of a reduction tree from the zeroth "lane" to itself.
+    //  [t] += [t-1] if t >= 1
+    //  [t] += [t-2] if t >= 2
+    //  [t] += [t-4] if t >= 4
+    //  ...
+    for (int j = 1; j < static_cast<int>(grange1); j <<= 1) {
+      value_type tmp = sg.shuffle_up(val, j);
+      if (j <= static_cast<int>(tidx1)) {
+        reducer.join(val, tmp);
+      }
+    }
+
+    // Include the value accumulated from previous chunks.
+    reducer.join(val, accum);
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
-    closure(i, accum, true);
+    // Apply i's contribution; the last lane's val becomes the next accum.
+    if (i < loop_boundaries.end) closure(i, val, true);
+    accum = sg.shuffle(val, mask + vector_offset);
   }
 }
 
@@ -792,21 +870,26 @@ template <class FunctorType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda) {
-  if (single_struct.team_member.team_rank() == 0) lambda();
+  if (single_struct.team_member.item().get_local_linear_id() == 0) lambda();
 }
 
 template <class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::VectorSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.item().get_local_id(1) == 0) lambda(val);
+  const sycl::nd_item<2> item = single_struct.team_member.item();
+  const auto grange1          = item.get_local_range(1);
+  const auto sg               = item.get_sub_group();
+  if (item.get_local_id(1) == 0) lambda(val);
+  val = sg.shuffle(val, (sg.get_local_id() / grange1) * grange1);
 }
 
 template <class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.team_rank() == 0) lambda(val);
+  if (single_struct.team_member.item().get_local_linear_id() == 0) lambda(val);
+  single_struct.team_member.team_broadcast(val, 0);
 }
 
 }  // namespace Kokkos
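Both vector_reduce and the ThreadVectorRange parallel_scan above lean on sub-group shuffles: shuffle_down folds lanes onto the root lane of each vector, while shuffle_up drives the triangular "[t] += [t-j] if t >= j" rounds of the scan. A host-side simulation of those scan rounds (hypothetical helper, no SYCL required) makes the pattern easy to check against a serial inclusive scan:

```cpp
#include <cassert>
#include <vector>

// Host-side simulation of the shuffle_up scan rounds used in parallel_scan
// above: after log2(width) rounds of "[t] += [t-j] if t >= j", lane t holds
// the inclusive scan of lanes 0..t. The width must be a power of two.
std::vector<int> triangular_scan(std::vector<int> lanes) {
  const int width = static_cast<int>(lanes.size());
  for (int j = 1; j < width; j <<= 1) {
    std::vector<int> shifted(width, 0);
    for (int t = j; t < width; ++t) shifted[t] = lanes[t - j];  // shuffle_up
    for (int t = j; t < width; ++t) lanes[t] += shifted[t];     // join
  }
  return lanes;
}

int main() {
  const std::vector<int> result = triangular_scan({3, 1, 4, 1, 5, 9, 2, 6});
  assert(result[2] == 3 + 1 + 4);  // inclusive prefix of lanes 0..2
  assert(result.back() == 31);     // total over all lanes
}
```

The kernel turns this inclusive pattern into an exclusive scan by seeding lane t with lane t-1's contribution (the closure(i - 1, val, false) call) before the rounds run.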
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
index 141a692f6090555cf129997a64bc9e99941f830d..d2820b3b3a34cdb933c4615260a73e1b82e7de34 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
@@ -89,7 +89,7 @@ class UniqueToken<SYCL, UniqueTokenScope::Global> {
     const Kokkos::pair<int, int> result =
         Kokkos::Impl::concurrent_bitset::acquire_bounded(
             m_buffer, m_count
-#if defined(KOKKOS_ARCH_INTEL_GEN)
+#ifdef KOKKOS_ARCH_INTEL_GPU
             ,
             Kokkos::Impl::clock_tic() % m_count
 #endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 92bd671bd53bf89482aee39cdd34b3391e9a01a2..18ef97ae4650ff50e4ea4a51b74ab53c88970ca4 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -288,21 +288,46 @@ int ThreadsExec::in_parallel() {
   return s_current_function && (&s_threads_process != s_current_function_arg) &&
          (s_threads_process.m_pool_base || !is_process());
 }
+void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); }
+void ThreadsExec::fence(const std::string &name) {
+  internal_fence(name, Impl::fence_is_static::yes);
+}
+
+void ThreadsExec::internal_fence(Impl::fence_is_static is_static) {
+  internal_fence((is_static == Impl::fence_is_static::no)
+                     ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence"
+                     : "Kokkos::ThreadsExec::fence: Unnamed Global Fence",
+                 is_static);
+}
 
 // Wait for root thread to become inactive
-void ThreadsExec::fence() {
-  if (s_thread_pool_size[0]) {
-    // Wait for the root thread to complete:
-    Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state,
-                                    ThreadsExec::Active);
-  }
+void ThreadsExec::internal_fence(const std::string &name,
+                                 Impl::fence_is_static is_static) {
+  const auto &fence_lam = [&]() {
+    if (s_thread_pool_size[0]) {
+      // Wait for the root thread to complete:
+      Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state,
+                                      ThreadsExec::Active);
+    }
 
-  s_current_function     = nullptr;
-  s_current_function_arg = nullptr;
+    s_current_function     = nullptr;
+    s_current_function_arg = nullptr;
 
-  // Make sure function and arguments are cleared before
-  // potentially re-activating threads with a subsequent launch.
-  memory_fence();
+    // Make sure function and arguments are cleared before
+    // potentially re-activating threads with a subsequent launch.
+    memory_fence();
+  };
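+  // A static fence is reported to the tools interface as a global device
+  // synchronization; an instance fence instead carries a direct fence ID so
+  // profilers can attribute it to a particular execution space instance.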
+  if (is_static == Impl::fence_is_static::yes) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        fence_lam);
+  } else {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>(
+        name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1},
+        fence_lam);
+  }
 }
 
 /** \brief  Begin execution of the asynchronous functor */
@@ -769,7 +794,12 @@ void ThreadsExec::finalize() {
 namespace Kokkos {
 
 int Threads::concurrency() { return impl_thread_pool_size(0); }
-void Threads::fence() const { Impl::ThreadsExec::fence(); }
+void Threads::fence() const {
+  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::no);
+}
+void Threads::fence(const std::string &name) const {
+  Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no);
+}
 
 Threads &Threads::impl_instance(int) {
   static Threads t;
@@ -832,6 +862,9 @@ void ThreadsSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void ThreadsSpaceInitializer::fence() { Kokkos::Threads::impl_static_fence(); }
+void ThreadsSpaceInitializer::fence(const std::string &name) {
+  Kokkos::Threads::impl_static_fence(name);
+}
 
 void ThreadsSpaceInitializer::print_configuration(std::ostream &msg,
                                                   const bool detail) {
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index 1c8b3ac5f6a7685d2bec7d36b53fc657bf7ba1b9..4d9a72a03467977ed21867a90a84563c7254bba7 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -63,7 +63,6 @@
 
 namespace Kokkos {
 namespace Impl {
-
 class ThreadsExec {
  public:
   // Fan array has log_2(NT) reduction threads plus 2 scan threads
@@ -474,6 +473,12 @@ class ThreadsExec {
 
   static int in_parallel();
   static void fence();
+  static void fence(const std::string &);
+  static void internal_fence(
+      Impl::fence_is_static is_static = Impl::fence_is_static::yes);
+  static void internal_fence(
+      const std::string &,
+      Impl::fence_is_static is_static = Impl::fence_is_static::yes);
   static bool sleep();
   static bool wake();
 
@@ -635,7 +640,12 @@ inline void Threads::print_configuration(std::ostream &s, const bool detail) {
   Impl::ThreadsExec::print_configuration(s, detail);
 }
 
-inline void Threads::impl_static_fence() { Impl::ThreadsExec::fence(); }
+inline void Threads::impl_static_fence() {
+  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::yes);
+}
+inline void Threads::impl_static_fence(const std::string &name) {
+  Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes);
+}
 } /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
index 40a09ed22ab1d6d73b62084549049521e0eb3150..e4eaeac78163efe48a2ddbd6d39920900b035c29 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@@ -100,8 +100,8 @@ bool ThreadsExec::spawn() {
 
   pthread_attr_t attr;
 
-  if (0 == pthread_attr_init(&attr) ||
-      0 == pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM) ||
+  if (0 == pthread_attr_init(&attr) &&
+      0 == pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM) &&
       0 == pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) {
     pthread_t pt;
 
diff --git a/packages/kokkos/core/src/desul/.clang-format b/packages/kokkos/core/src/desul/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..9d159247d518108410702980b90b13c2cfb4b84f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/.clang-format
@@ -0,0 +1,2 @@
+DisableFormat: true
+SortIncludes: false
diff --git a/packages/kokkos/core/src/desul/atomics.hpp b/packages/kokkos/core/src/desul/atomics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab3fe25392faa70027cb19c2a02c18c570c5768b
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics.hpp
@@ -0,0 +1,19 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_HPP_
+#define DESUL_ATOMICS_HPP_
+
+#include "desul/atomics/Macros.hpp"
+
+#include "desul/atomics/Atomic_Ref.hpp"
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Generic.hpp"
+#include "desul/atomics/Lock_Array.hpp"
+
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp b/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..73cd01a7e6ff9c54b8b851193bf256124f399cfe
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
@@ -0,0 +1,541 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMIC_REF_IMPL_HPP_
+#define DESUL_ATOMIC_REF_IMPL_HPP_
+
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Generic.hpp"
+#include "desul/atomics/Macros.hpp"
+
+namespace desul {
+namespace Impl {
+
+// TODO current implementation is missing the following:
+// * member functions
+//   * wait
+//   * notify_one
+//   * notify_all
+
+template <typename T,
+          typename MemoryOrder,
+          typename MemoryScope,
+          bool = std::is_integral<T>{},
+          bool = std::is_floating_point<T>{}>
+struct basic_atomic_ref;
+
+// base class for non-integral, non-floating-point, non-pointer types
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, false> {
+  static_assert(std::is_trivially_copyable<T>{}, "");
+
+ private:
+  T* _ptr;
+
+  // 1/2/4/8/16-byte types must be aligned to at least their size
+  static constexpr int _min_alignment = (sizeof(T) & (sizeof(T) - 1)) || sizeof(T) > 16
+                                            ? 0
+                                            : sizeof(T);
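+  // Note: (sizeof(T) & (sizeof(T) - 1)) is nonzero exactly when sizeof(T) is
+  // not a power of two, so such types (and anything larger than 16 bytes) get
+  // _min_alignment == 0 and fall back to alignof(T) in required_alignment below.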
+
+ public:
+  using value_type = T;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = _min_alignment > alignof(T)
+                                                        ? _min_alignment
+                                                        : alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(std::addressof(obj)) {}
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+};
+
+// base class for atomic_ref<integral-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, true, false> {
+  static_assert(std::is_integral<T>{}, "");
+
+ private:
+  T* _ptr;
+
+ public:
+  using value_type = T;
+  using difference_type = value_type;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = sizeof(T) > alignof(T) ? sizeof(T)
+                                                                           : alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(&obj) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_and(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_and(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_or(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_or(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_xor(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_xor(_ptr, arg, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++() const noexcept {
+    return atomic_add_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); }
+
+  DESUL_FUNCTION value_type operator--() const noexcept {
+    return atomic_sub_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); }
+
+  DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept {
+    return atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept {
+    return atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator&=(value_type arg) const noexcept {
+    return atomic_and_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator|=(value_type arg) const noexcept {
+    return atomic_or_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator^=(value_type arg) const noexcept {
+    return atomic_xor_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+};
+
+// base class for atomic_ref<floating-point-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, true> {
+  static_assert(std::is_floating_point<T>{}, "");
+
+ private:
+  T* _ptr;
+
+ public:
+  using value_type = T;
+  using difference_type = value_type;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(&obj) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, arg, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept {
+    return atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept {
+    return atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+};
+
+// base class for atomic_ref<pointer-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T*, MemoryOrder, MemoryScope, false, false> {
+ private:
+  T** _ptr;
+
+ public:
+  using value_type = T*;
+  using difference_type = std::ptrdiff_t;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T*));
+
+  static constexpr std::size_t required_alignment = alignof(T*);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T*& arg) : _ptr(std::addressof(arg)) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T* operator=(T* desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T*() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T* desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T* load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T* exchange(T* desired,
+                             _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T*), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T*& expected,
+                                            T* desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T*& expected,
+      T* desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, _type_size(d), order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, _type_size(d), order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++() const noexcept {
+    return atomic_add_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); }
+
+  DESUL_FUNCTION value_type operator--() const noexcept {
+    return atomic_sub_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); }
+
+  DESUL_FUNCTION value_type operator+=(difference_type d) const noexcept {
+    return atomic_add_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(difference_type d) const noexcept {
+    return atomic_sub_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope());
+  }
+
+ private:
+  static constexpr std::ptrdiff_t _type_size(std::ptrdiff_t d) noexcept {
+    static_assert(std::is_object<T>{}, "");
+    return d * sizeof(T);
+  }
+};
+
+}  // namespace Impl
+
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct scoped_atomic_ref : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope> {
+  explicit scoped_atomic_ref(T& obj) noexcept
+      : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>(obj) {}
+
+  scoped_atomic_ref& operator=(scoped_atomic_ref const&) = delete;
+
+  scoped_atomic_ref(scoped_atomic_ref const&) = default;
+
+  using Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>::operator=;
+};
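+// A minimal usage sketch (hypothetical host-side counter; assumes the generic
+// layer provides relaxed integer atomics for the active backend):
+//   int count = 0;
+//   desul::scoped_atomic_ref<int, desul::MemoryOrderRelaxed,
+//                            desul::MemoryScopeDevice> ref(count);
+//   ref.fetch_add(1);          // atomic read-modify-write on count
+//   int snapshot = ref.load(); // relaxed load by default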
+
+}  // namespace desul
+
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/CUDA.hpp b/packages/kokkos/core/src/desul/atomics/CUDA.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..32873a59776b07dea770c193e0034c1e82387246
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/CUDA.hpp
@@ -0,0 +1,453 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_CUDA_HPP_
+#define DESUL_ATOMICS_CUDA_HPP_
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+// When building with clang we always need to include the device functions,
+// since clang must see a consistent overload set in both device and host
+// compilation, but that means we need to know on the host what to make
+// visible, i.e. we need host-side compile knowledge of the architecture.
+// We can simply say that DESUL proper doesn't support clang CUDA builds
+// pre Volta; Kokkos has that knowledge, so we use it here, allowing Kokkos
+// to use clang as a CUDA compiler for pre-Volta architectures.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__>=700)) || \
+    (!defined(__NVCC__) && !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL))
+#define DESUL_HAVE_CUDA_ATOMICS_ASM
+#include <desul/atomics/cuda/CUDA_asm.hpp>
+#endif
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__<700)) || \
+    (!defined(__NVCC__) && !defined(DESUL_HAVE_CUDA_ATOMICS_ASM))
+namespace desul {
+namespace Impl {
+template<class T>
+struct is_cuda_atomic_integer_type {
+  static constexpr bool value = std::is_same<T,int>::value ||
+                                std::is_same<T,unsigned int>::value ||
+                                std::is_same<T,unsigned long long int>::value;
+};
+
+template<class T>
+struct is_cuda_atomic_add_type {
+  static constexpr bool value = is_cuda_atomic_integer_type<T>::value ||
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+                                std::is_same<T,double>::value || 
+#endif
+                                std::is_same<T,float>::value;
+};
+
+template<class T>
+struct is_cuda_atomic_sub_type {
+  static constexpr bool value = std::is_same<T,int>::value ||
+                                std::is_same<T,unsigned int>::value;
+};
+}  // namespace Impl
+
+// Atomic Add
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest,val);
+  __threadfence();
+  return return_val;
+}
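+// Note: the hardware atomics above are relaxed; wrapping the RMW in
+// __threadfence() conservatively upgrades any stronger memory order at
+// device scope. The same pattern is repeated for the operations below.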
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+
+// Atomic Sub
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Inc
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicInc(dest,val);
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicInc(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_inc(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Dec
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicDec(dest,val);
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicDec(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_dec(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+
+// Atomic Max
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMax(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMax(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_max(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Min
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMin(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMin(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_min(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic And
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAnd(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAnd(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_and(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic XOR
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicXor(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicXor(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_xor(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic OR
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicOr(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicOr(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_or(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+}  // namespace desul
+#endif
+
+#if !defined(__NVCC__)
+// Functions defined as device functions in CUDA which don't exist in the GCC overload set
+namespace desul {
+
+#if defined(DESUL_HAVE_CUDA_ATOMICS_ASM)
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(TYPE,ORDER,SCOPE) \
+    inline void atomic_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_add(dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(TYPE,ORDER,SCOPE) \
+    inline void atomic_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_sub(dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_INC(TYPE,ORDER,SCOPE) \
+    inline void atomic_inc(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_inc(dest, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_INC(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(TYPE,ORDER,SCOPE) \
+    inline void atomic_dec(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_dec(dest, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(unsigned,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+#endif // DESUL_HAVE_CUDA_ATOMICS_ASM
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::AddOper<TYPE, const TYPE>(),dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::SubOper<TYPE, const TYPE>(),dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_max(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::MaxOper<TYPE, const TYPE>(), dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_min(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::MinOper<TYPE, const TYPE>(), dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+
+}
+
+// Functions defined in the GCC overload set but not in the device overload set
+namespace desul {
+  __device__ inline
+  unsigned long long atomic_fetch_add(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_add(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_add(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_sub(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_sub(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::SubOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_max(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::MaxOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_min(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::MinOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_or(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::OrOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_or(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_xor(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::XorOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_xor(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_and(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AndOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_and(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+  }
+
+
+  __device__ inline
+  unsigned long long atomic_add_fetch(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_add_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_add_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_sub_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_sub_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::SubOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_or_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_or_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::OrOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_xor_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_xor_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::XorOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_and_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_and_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AndOper<long, const long>(), dest, val, order, scope);
+  }
+}
+#endif
+#endif  // DESUL_HAVE_CUDA_ATOMICS
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Common.hpp b/packages/kokkos/core/src/desul/atomics/Common.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1dccc6c52318f58b6fb1ed792ed614a8351458c
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Common.hpp
@@ -0,0 +1,199 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMMON_HPP_
+#define DESUL_ATOMICS_COMMON_HPP_
+#include "desul/atomics/Macros.hpp"
+#include <cstdint>
+#include <atomic>
+#include <type_traits>
+
+namespace desul {
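+// 16-byte stand-in payload for 128-bit compare-and-swap paths; see
+// atomic_compare_exchange_type further down in this header.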
+struct alignas(16) Dummy16ByteValue {
+  int64_t value1;
+  int64_t value2;
+  bool operator!=(Dummy16ByteValue v) const {
+    return (value1 != v.value1) || (value2 != v.value2);
+  }
+  bool operator==(Dummy16ByteValue v) const {
+    return (value1 == v.value1) && (value2 == v.value2);
+  }
+};
+}  // namespace desul
+
+// MemoryOrder Tags
+
+namespace desul {
+// Memory order sequential consistent
+struct MemoryOrderSeqCst {};
+// Memory order acquire release
+struct MemoryOrderAcqRel {};
+// Memory order acquire
+struct MemoryOrderAcquire {};
+// Memory order release
+struct MemoryOrderRelease {};
+// Memory order relaxed
+struct MemoryOrderRelaxed {};
+}  // namespace desul
+
+// Memory Scope Tags
+
+namespace desul {
+// Entire machine scope (e.g. for global arrays)
+struct MemoryScopeSystem {};
+// Node level
+struct MemoryScopeNode {};
+// Device or socket scope (i.e. a CPU socket, a single GPU)
+struct MemoryScopeDevice {};
+// Core scoped (i.e. a shared Level 1 cache)
+struct MemoryScopeCore {};
+}  // namespace desul
+
+#ifndef __ATOMIC_RELAXED
+#define __ATOMIC_RELAXED 0
+#define __ATOMIC_CONSUME 1
+#define __ATOMIC_ACQUIRE 2
+#define __ATOMIC_RELEASE 3
+#define __ATOMIC_ACQ_REL 4
+#define __ATOMIC_SEQ_CST 5
+#endif
+
+namespace desul {
+template <class MemoryOrderDesul>
+struct GCCMemoryOrder;
+
+template <>
+struct GCCMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr int value = __ATOMIC_RELAXED;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderAcquire> {
+  static constexpr int value = __ATOMIC_ACQUIRE;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderRelease> {
+  static constexpr int value = __ATOMIC_RELEASE;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr int value = __ATOMIC_ACQ_REL;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr int value = __ATOMIC_SEQ_CST;
+};
+
+template <class MemoryOrderDesul>
+struct CXXMemoryOrder;
+
+template <>
+struct CXXMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr std::memory_order value = std::memory_order_relaxed;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderAcquire> {
+  static constexpr std::memory_order value = std::memory_order_acquire;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderRelease> {
+  static constexpr std::memory_order value = std::memory_order_release;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr std::memory_order value = std::memory_order_acq_rel;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr std::memory_order value = std::memory_order_seq_cst;
+};
+
+namespace Impl {
+template <typename MemoryOrder>
+struct CmpExchFailureOrder {
+  using memory_order = std::conditional_t<
+      std::is_same<MemoryOrder, MemoryOrderAcqRel>{},
+      MemoryOrderAcquire,
+      std::conditional_t<std::is_same<MemoryOrder, MemoryOrderRelease>{},
+                         MemoryOrderRelaxed,
+                         MemoryOrder>>;
+};
+template <typename MemoryOrder>
+using cmpexch_failure_memory_order =
+    typename CmpExchFailureOrder<MemoryOrder>::memory_order;
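+// For example, cmpexch_failure_memory_order<MemoryOrderAcqRel> is
+// MemoryOrderAcquire, matching the C++ rule that the failure order of a
+// compare-exchange may not be a release order.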
+}  // namespace Impl
+
+}
+
+// We should in principle use std::numeric_limits, but that requires constexpr function support on device.
+// Currently that is still considered experimental on CUDA and sometimes not reliable.
+namespace desul {
+namespace Impl {
+template<class T>
+struct numeric_limits_max;
+
+template<>
+struct numeric_limits_max<uint32_t> {
+  static constexpr uint32_t value = 0xffffffffu;
+};
+template<>
+struct numeric_limits_max<uint64_t> {
+  static constexpr uint64_t value = 0xffffffffffffffffull;
+};
+
+constexpr bool atomic_always_lock_free(std::size_t size) {
+  return size == 4 || size == 8
+#if defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP)
+         || size == 16
+#endif
+      ;
+}
+
+template <std::size_t Size, std::size_t Align>
+DESUL_INLINE_FUNCTION bool atomic_is_lock_free() noexcept {
+  return Size == 4 || Size == 8
+#if defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP)
+         || Size == 16
+#endif
+      ;
+}
+
+template<std::size_t N>
+struct atomic_compare_exchange_type;
+
+template<>
+struct atomic_compare_exchange_type<4> {
+  using type = int32_t;
+};
+
+template<>
+struct atomic_compare_exchange_type<8> {
+  using type = int64_t;
+};
+
+template<>
+struct atomic_compare_exchange_type<16> {
+  using type = Dummy16ByteValue;
+};
+
+template<class T>
+struct dont_deduce_this_parameter { using type = T; };
+
+template<class T>
+using dont_deduce_this_parameter_t = typename dont_deduce_this_parameter<T>::type;
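+// Spelling a parameter as dont_deduce_this_parameter_t<T> places it in a
+// non-deduced context, so only the remaining arguments drive deduction of T.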
+
+}
+}
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b8289d75b8e70a1097207418a5a0f435913cded
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
@@ -0,0 +1,35 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
+
+#include "desul/atomics/Macros.hpp"
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+#include "desul/atomics/Compare_Exchange_GCC.hpp"
+#endif
+#ifdef DESUL_HAVE_MSVC_ATOMICS
+#include "desul/atomics/Compare_Exchange_MSVC.hpp"
+#endif
+#ifdef DESUL_HAVE_SERIAL_ATOMICS
+#include "desul/atomics/Compare_Exchange_Serial.hpp"
+#endif
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+#include "desul/atomics/Compare_Exchange_CUDA.hpp"
+#endif
+#ifdef DESUL_HAVE_HIP_ATOMICS
+#include "desul/atomics/Compare_Exchange_HIP.hpp"
+#endif
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+#include "desul/atomics/Compare_Exchange_OpenMP.hpp"
+#endif
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Compare_Exchange_SYCL.hpp"
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..aab0d943eb659f9c0f860fef1293753bcf5c52be
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
@@ -0,0 +1,267 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_CUDA_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_CUDA_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Lock_Array_Cuda.hpp"
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+namespace desul {
+// Only include if compiling device code, or the CUDA compiler is not NVCC (i.e. Clang)
+// atomic_thread_fence implementation
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  __threadfence_block();
+}
+#if (__CUDA_ARCH__>=600) || !defined(__NVCC__)
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
+  __threadfence_system();
+}
+#endif
+#endif
+}
+
+// Compare Exchange for PRE Volta, not supported with CLANG as the CUDA compiler,
+// since we do NOT have a way of including this code for clang only when the CC is
+// smaller than 700; on Clang the device-side symbol list must be independent of __CUDA_ARCH__.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) || \
+(!defined(__NVCC__) && (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL) || defined(KOKKOS_ARCH_PASCAL)))
+namespace desul {
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
+                                      reinterpret_cast<unsigned int&>(compare),
+                                      reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long is 64-bit");
+  unsigned long long int return_val =
+      atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
+                reinterpret_cast<unsigned long long int&>(compare),
+                reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
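+// The reinterpret_casts above type-pun arbitrary trivially copyable 4- and
+// 8-byte T through the unsigned integer types that atomicCAS supports.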
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
+                                       reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long is 64-bit");
+  unsigned long long int return_val =
+      atomicExch(reinterpret_cast<unsigned long long int*>(dest),
+                 reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+}  // namespace desul
+#endif
+
+// Include the CUDA PTX based exchange atomics.
+// When building with Clang the device functions must always be included, since
+// Clang has to see a consistent overload set in both device and host
+// compilation. That in turn requires host-side knowledge of the target
+// architecture, i.e. knowing on the host what to make visible. DESUL proper
+// does not support Clang as a CUDA compiler for pre-Volta targets, but Kokkos
+// carries that knowledge in its KOKKOS_ARCH macros, which are used here so
+// that Kokkos can use Clang as a CUDA compiler even for pre-Volta targets.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__>=700)) || \
+     (!defined(__NVCC__) && !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL))
+#include <desul/atomics/cuda/CUDA_asm_exchange.hpp>
+#endif
+
+// SeqCst is not directly supported by PTX; the additional fences are needed:
+
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+namespace desul {
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+}  // namespace desul
+#endif
+
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+namespace desul {
+template <typename T, class MemoryOrder, class MemoryScope>
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+  // This loop avoids deadlock within a warp or wavefront
+  T return_val;
+  int done = 0;
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        if(return_val == compare) {
+          *dest = value;
+          atomic_thread_fence(MemoryOrderRelease(),scope);
+        }
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+}
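+
+// Hedged usage sketch: the lock-based loop above is what a CAS on a type whose
+// size is neither 4 nor 8 bytes falls back to, e.g. a 2-byte integer:
+//
+//   __device__ void cas_short_example(short* p) {
+//     // serialized through the desul device lock array; the activemask/ballot
+//     // loop keeps divergent threads of a warp from deadlocking on the lock
+//     short old = desul::atomic_compare_exchange(
+//         p, short(0), short(1), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+//     (void)old;
+//   }
+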
+template <typename T, class MemoryOrder, class MemoryScope>
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope scope) {
+  // This loop avoids deadlock within a warp or wavefront
+  T return_val;
+  int done = 0;
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        *dest = value;
+        atomic_thread_fence(MemoryOrderRelease(),scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+}
+}  // namespace desul
+#endif
+
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..418bea0b8b72f42883cc582bd58a5a170f738fea
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
@@ -0,0 +1,91 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_GCC_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_GCC_HPP_
+#include "desul/atomics/Common.hpp"
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+#if !defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP) && !defined(__CUDACC__)
+// This does not appear to work under WSL, so it remains disabled for now:
+//#define DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#endif
+namespace desul {
+
+namespace Impl {
+template<class T>
+struct atomic_exchange_available_gcc {
+  constexpr static bool value =
+#ifndef DESUL_HAVE_LIBATOMIC
+    ((sizeof(T)==4 && alignof(T)==4) ||
+#ifdef DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+     (sizeof(T)==16 && alignof(T)==16) ||
+#endif
+     (sizeof(T)==8 && alignof(T)==8)) &&
+#endif
+    std::is_trivially_copyable<T>::value;
+};
+} //namespace Impl
+
+#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+// Disable warning for large atomics on clang 7 and up (checked with godbolt)
+// error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
+// https://godbolt.org/z/G7YhqhbG6
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Watomic-alignment"
+#endif
+template<class MemoryOrder, class MemoryScope>
+void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  __atomic_thread_fence(GCCMemoryOrder<MemoryOrder>::value);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_exchange(
+    T* dest, T value, MemoryOrder, MemoryScope) {
+  T return_val;
+  __atomic_exchange(
+     dest, &value, &return_val, GCCMemoryOrder<MemoryOrder>::value);
+  return return_val;
+}
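+
+// A minimal host-side usage sketch (illustrative, assuming a type for which
+// Impl::atomic_exchange_available_gcc holds):
+//
+//   double slot = 0.0;
+//   double prev = desul::atomic_exchange(
+//       &slot, 1.0, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+//   // prev == 0.0 and slot == 1.0 afterwards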
+
+// The failure memory order of the __atomic_compare_exchange built-in cannot be
+// RELEASE or ACQREL, so those two orders are handled separately below.
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, GCCMemoryOrder<MemoryOrder>::value, GCCMemoryOrder<MemoryOrder>::value);
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+  return compare;
+}
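+
+// For reference, the success/failure order pairs passed to the built-in by the
+// overloads above:
+//   MemoryOrderRelease -> (__ATOMIC_RELEASE, __ATOMIC_RELAXED)
+//   MemoryOrderAcqRel  -> (__ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)
+//   all other orders   -> (order, order)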
+
+#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+#pragma GCC diagnostic pop
+#endif
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6bf04a7e6d25449934f4813c936bf37ce9bb07b
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
@@ -0,0 +1,253 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Lock_Array_HIP.hpp"
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+namespace desul {
+#if defined(__HIP_DEVICE_COMPILE__)
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
+                                      reinterpret_cast<unsigned int&>(compare),
+                                      reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
+                reinterpret_cast<unsigned long long int&>(compare),
+                reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
+                                       reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicExch(reinterpret_cast<unsigned long long int*>(dest),
+                 reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION __device__
+    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+    atomic_compare_exchange(
+        T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+  // This loop avoids deadlock within a warp or wavefront
+  T return_val;
+  int done = 0;
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        if (return_val == compare) {
+          *dest = value;
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        }
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+}
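+
+// Note: unlike the CUDA variant of this loop, HIP wavefronts are 64 lanes
+// wide, so the ballot bookkeeping above uses unsigned long long and needs no
+// explicit activemask.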
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION __device__
+    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+    atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) {
+  // This loop avoids deadlock within a warp or wavefront
+  T return_val;
+  int done = 0;
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = value;
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+}
+#endif
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c96cb031714f63b5039ade535077c7511838ffbd
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp
@@ -0,0 +1,201 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_MSVC_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_MSVC_HPP_
+#include "desul/atomics/Common.hpp"
+#include <type_traits>
+#include <atomic>  // std::atomic_thread_fence
+#ifdef DESUL_HAVE_MSVC_ATOMICS
+
+#ifndef DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#define DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#endif
+
+namespace desul {
+
+// Catch-all placeholder so the overload set is never empty; the size-specific
+// overloads below are more specialized and are selected for supported sizes.
+template<class T, class MemoryOrder, class MemoryScope>
+T atomic_exchange(T* const, T val, MemoryOrder, MemoryScope) { return val; }
+
+
+template<class MemoryOrder, class MemoryScope>
+void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  std::atomic_thread_fence(CXXMemoryOrder<MemoryOrder>::value);
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  char return_val =
+      _InterlockedExchange8((char*)dest, *((char*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  short return_val =
+      _InterlockedExchange16((short*)dest, *((short*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  long return_val =
+      _InterlockedExchange((long*)dest, *((long*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  __int64 return_val = _InterlockedExchange64(
+      (__int64*)dest, *((__int64*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  char return_val =
+      _InterlockedExchange8((char*)dest, *((char*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  short return_val =
+      _InterlockedExchange16((short*)dest, *((short*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  long return_val =
+      _InterlockedExchange((long*)dest, *((long*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  __int64 return_val = _InterlockedExchange64(
+      (__int64*)dest, *((__int64*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  char return_val =
+      _InterlockedCompareExchange8((char*)dest, *((char*)&val), *((char*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  short return_val =
+      _InterlockedCompareExchange16((short*)dest, *((short*)&val), *((short*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  long return_val =
+      _InterlockedCompareExchange((long*)dest, *((long*)&val), *((long*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  __int64 return_val = _InterlockedCompareExchange64(
+      (__int64*)dest, *((__int64*)&val), *((__int64*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 16, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  Dummy16ByteValue* val16 = reinterpret_cast<Dummy16ByteValue*>(&val);
+  (void)_InterlockedCompareExchange128(reinterpret_cast<__int64*>(dest),
+                                       val16->value2,
+                                       val16->value1,
+                                       (reinterpret_cast<__int64*>(&compare)));
+  return compare;
+}
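+
+// Note on the intrinsic above: _InterlockedCompareExchange128 stores the
+// previous contents of *dest back into the comparand buffer, so returning
+// `compare` yields the old value whether or not the exchange happened.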
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  char return_val =
+      _InterlockedCompareExchange8((char*)dest, *((char*)&val), *((char*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  short return_val =
+      _InterlockedCompareExchange16((short*)dest, *((short*)&val), *((short*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  long return_val =
+      _InterlockedCompareExchange((long*)dest, *((long*)&val), *((long*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  __int64 return_val = _InterlockedCompareExchange64(
+      (__int64*)dest, *((__int64*)&val), *((__int64*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 16, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  Dummy16ByteValue* val16 = reinterpret_cast<Dummy16ByteValue*>(&val);
+  (void)_InterlockedCompareExchange128(reinterpret_cast<__int64*>(dest),
+                                       val16->value2,
+                                       val16->value1,
+                                       (reinterpret_cast<__int64*>(&compare)));
+  return compare;
+}
+
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 1 && sizeof(T) != 2 && sizeof(T) != 4 && sizeof(T) != 8 && sizeof(T) != 16), T>::type atomic_compare_exchange(
+     T* const dest, T compare, T val, MemoryOrder, MemoryScope scope) {
+  while (!Impl::lock_address((void*)dest, scope)) {}
+  if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+    atomic_thread_fence(MemoryOrderRelease(), scope);
+  atomic_thread_fence(MemoryOrderAcquire(), scope);
+  T return_val = *dest;
+  if(return_val == compare) {
+    *dest = val;
+    atomic_thread_fence(MemoryOrderRelease(),scope);
+  }
+
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+
+}  // namespace desul
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1d1c9124991d01640ca70243e9033e4c528e6cf
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
@@ -0,0 +1,145 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
+#include "desul/atomics/Common.hpp"
+#include <cstdio>
+#include <omp.h>
+
+namespace desul
+{
+namespace Impl
+{
+static constexpr bool omp_on_host() { return true; }
+
+#pragma omp begin declare variant match(device = {kind(host)})
+static constexpr bool omp_on_host() { return true; }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(nohost)})
+static constexpr bool omp_on_host() { return false; }
+#pragma omp end declare variant
+} // namespace Impl
+} // namespace desul
+
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+namespace desul {
+
+#if _OPENMP > 201800
+// atomic_thread_fence for Core Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  // OpenMP has no seq_cst flush; acq_rel is the strongest flush available and
+  // should be equivalent for a fence.
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  #pragma omp flush release
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  #pragma omp flush acquire
+}
+// atomic_thread_fence for Device Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  // OpenMP has no seq_cst flush; acq_rel is the strongest flush available and
+  // should be equivalent for a fence.
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  #pragma omp flush release
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  #pragma omp flush acquire
+}
+#else
+// atomic_thread_fence for Core Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  #pragma omp flush
+}
+// atomic_thread_fence for Device Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  #pragma omp flush
+}
+#endif
+
+template <typename T, class MemoryOrder, class MemoryScope>
+T atomic_exchange(
+    T* dest, T value, MemoryOrder, MemoryScope) {
+  T return_val;
+  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T& x = *dest;
+  #pragma omp atomic capture
+  { return_val = x; x = value; }
+  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
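+
+// A minimal usage sketch of the capture-based exchange above (illustrative):
+//
+//   int slot = 0;
+//   #pragma omp parallel
+//   {
+//     int prev = desul::atomic_exchange(
+//         &slot, omp_get_thread_num(), desul::MemoryOrderRelaxed(),
+//         desul::MemoryScopeDevice());
+//     (void)prev;  // each thread swaps in its id and observes some prior value
+//   }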
+
+// OpenMP doesn't have a compare-exchange construct, so we use the compiler
+// built-ins and rely on testing that this works.
+// Note that this means we test these built-ins in OpenMPTarget offload regions!
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_always_lock_free(sizeof(T)),T> atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t retval = __sync_val_compare_and_swap(
+     reinterpret_cast<volatile cas_t*>(dest), 
+     reinterpret_cast<cas_t&>(compare), 
+     reinterpret_cast<cas_t&>(value));
+  return reinterpret_cast<T&>(retval);
+}
+// Make 16-byte CAS work at least on the host (via the omp_on_host() variant
+// check; note that the use of if constexpr requires C++17).
+#if __cplusplus>=201703L
+
+#if defined(__clang__) && (__clang_major__>=7)
+// Disable warning for large atomics on clang 7 and up (checked with godbolt)
+// error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Watomic-alignment"
+#endif
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_always_lock_free(sizeof(T)) && (sizeof(T)==16),T> atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  if constexpr (desul::Impl::omp_on_host()) {
+    (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, GCCMemoryOrder<MemoryOrder>::value, GCCMemoryOrder<MemoryOrder>::value);
+    return compare;
+  } else {
+    return value;
+  }
+}
+#if defined(__clang__) && (__clang_major__>=7)
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8fd2ebbe2beef39e4cd8dff5797b722e8d17582
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
@@ -0,0 +1,102 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/SYCLConversions.hpp"
+#include <CL/sycl.hpp>
+
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+
+namespace desul {
+
+template<class MemoryOrder, class MemoryScope>
+inline void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  DESUL_SYCL_NAMESPACE::atomic_fence(DesulToSYCLMemoryOrder<MemoryOrder>::value,
+                                     DesulToSYCLMemoryScope<MemoryScope>::value);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScope>::value, 
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned int*>(&compare), 
+                                   *reinterpret_cast<unsigned int*>(&value));
+  return compare;
+}
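+
+// Minimal usage sketch (illustrative), assuming `p` points into a
+// global-device allocation:
+//
+//   int old = desul::atomic_compare_exchange(
+//       p, 0, 1, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+//   // old is the previous value of *p; the exchange happened iff old == 0
+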
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long is 64-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned long long int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScope>::value, 
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned long long int*>(&compare),
+                                   *reinterpret_cast<unsigned long long int*>(&value));
+  return compare;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScope>::value,  
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned int*>(dest));
+  unsigned int return_val = dest_ref.exchange(*reinterpret_cast<unsigned int*>(&value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long is 64-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned long long int,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScope>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
+  unsigned long long int return_val =
+      dest_ref.exchange(reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_compare_exchange(
+    T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return compare;  
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
+    T* const /*dest*/, T value, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return value;
+}
+
+}  // namespace desul
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be7b46d5fa0540f20abfb8903f20a3e2f7d80e5a
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
@@ -0,0 +1,45 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SERIAL_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_SERIAL_HPP_
+
+#ifdef DESUL_HAVE_SERIAL_ATOMICS
+namespace desul {
+// In the serial backend, fences need not do anything.
+template<class MemoryScope>
+void atomic_thread_fence(MemoryOrderAcquire, MemoryScope) {
+}
+
+template<class MemoryScope>
+void atomic_thread_fence(MemoryOrderRelease, MemoryScope) {
+}
+
+template <typename T, class MemoryScope>
+T atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  T old = *dest;
+  if (old == compare) {
+    *dest = value;
+  }
+  return old;
+}
+template <typename T, class MemoryScope>
+T atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  T old = *dest;
+  if (old == compare) {
+    *dest = value;
+  }
+  return old;
+}
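+
+// Sketch of the contract (illustrative): the return value is the previous
+// contents of *dest, so a caller detects success by comparing it with
+// `compare`:
+//
+//   int x = 5;
+//   int prev = desul::atomic_compare_exchange(
+//       &x, 5, 7, desul::MemoryOrderRelaxed(), desul::MemoryScopeCore());
+//   // prev == 5 (success) and x == 7
+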
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/GCC.hpp b/packages/kokkos/core/src/desul/atomics/GCC.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd0c2bea1180662969e0af8abee5d23a1b7334ca
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/GCC.hpp
@@ -0,0 +1,131 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_GCC_HPP_
+#define DESUL_ATOMICS_GCC_HPP_
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+
+#include<type_traits>
+/*
+Relevant GCC built-in functions (see the GCC __atomic documentation):
+  type __atomic_add_fetch (type *ptr, type val, int memorder)
+  type __atomic_sub_fetch (type *ptr, type val, int memorder)
+  type __atomic_and_fetch (type *ptr, type val, int memorder)
+  type __atomic_xor_fetch (type *ptr, type val, int memorder)
+  type __atomic_or_fetch (type *ptr, type val, int memorder)
+  type __atomic_nand_fetch (type *ptr, type val, int memorder)
+*/
+
+#define DESUL_GCC_INTEGRAL_OP_ATOMICS(MEMORY_ORDER, MEMORY_SCOPE)                 \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_add(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_add(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_sub(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_sub(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_and(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_and(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_or(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_or(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);   \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_xor(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_xor(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_nand( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_nand(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value); \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_add_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_add_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_sub_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_sub_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_and_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_and_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_or_fetch(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_or_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);   \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_xor_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_xor_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_nand_fetch( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_nand_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value); \
+  }
+
+namespace desul {
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeNode)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeDevice)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeCore)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeNode)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeDevice)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeCore)
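+
+// For reference, one instantiation of the macro above expands to overloads of
+// the form (sketch for MemoryOrderRelaxed / MemoryScopeDevice):
+//
+//   template <typename T>
+//   typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_add(
+//       T* const dest, T value, MemoryOrderRelaxed, MemoryScopeDevice) {
+//     return __atomic_fetch_add(dest, value, GCCMemoryOrder<MemoryOrderRelaxed>::value);
+//   }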
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_exchange(T* const dest,
+                  Impl::dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  *dest = val;
+  atomic_thread_fence(MemoryOrderRelease(),scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(T* const dest,
+                  Impl::dont_deduce_this_parameter_t<const T> compare,
+                  Impl::dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  if(return_val == compare) {
+    *dest = val;
+    atomic_thread_fence(MemoryOrderRelease(),scope);
+  }
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Generic.hpp b/packages/kokkos/core/src/desul/atomics/Generic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d5e87ece29f2c444522a91e4635598872f5b71f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Generic.hpp
@@ -0,0 +1,690 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_GENERIC_HPP_
+#define DESUL_ATOMICS_GENERIC_HPP_
+
+#include <type_traits>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Lock_Array.hpp"
+#include "desul/atomics/Macros.hpp"
+// Combination operators to be used in a compare-and-exchange based atomic
+// operation
+namespace desul {
+namespace Impl {
+
+template <class Scalar1, class Scalar2>
+struct MaxOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 > val2 ? val1 : val2);
+  }
+  DESUL_FORCEINLINE_FUNCTION
+  static constexpr bool check_early_exit(Scalar1 const& val1, Scalar2 const& val2) {
+    return val1 > val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct MinOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 < val2 ? val1 : val2);
+  }
+  DESUL_FORCEINLINE_FUNCTION
+  static constexpr bool check_early_exit(Scalar1 const& val1, Scalar2 const& val2) {
+    return val1 < val2;
+  }
+};
+
+// This early-exit optimization causes obscure compiler errors with MSVC 2019
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+template <typename Op, typename Scalar1, typename Scalar2, typename = bool>
+struct may_exit_early : std::false_type {};
+
+template <typename Op, typename Scalar1, typename Scalar2>
+struct may_exit_early<Op,
+                      Scalar1,
+                      Scalar2,
+                      decltype(Op::check_early_exit(std::declval<Scalar1 const&>(),
+                                                    std::declval<Scalar2 const&>()))>
+    : std::true_type {};
+
+template <typename Op, typename Scalar1, typename Scalar2>
+constexpr DESUL_FUNCTION typename std::enable_if<may_exit_early<Op, Scalar1, Scalar2>::value, bool>::type
+check_early_exit(Op const&, Scalar1 const& val1, Scalar2 const& val2) {
+  return Op::check_early_exit(val1, val2);
+}
+
+template <typename Op, typename Scalar1, typename Scalar2>
+constexpr DESUL_FUNCTION typename std::enable_if<!may_exit_early<Op, Scalar1, Scalar2>::value, bool>::type
+check_early_exit(Op const&, Scalar1 const&, Scalar2 const&) {
+  return false;
+}
+#endif
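+
+// Illustrative check of the trait above: MaxOper and MinOper expose
+// check_early_exit and thus may exit early, while an operator without it
+// (e.g. AddOper, defined below) may not:
+//
+//   static_assert(may_exit_early<MaxOper<int, int>, int, int>::value, "");
+//   static_assert(!may_exit_early<AddOper<int, int>, int, int>::value, "");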
+
+template <class Scalar1, class Scalar2>
+struct AddOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 + val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct SubOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 - val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct MulOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 * val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct DivOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 / val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct ModOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 % val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct AndOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 & val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct OrOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 | val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct XorOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 ^ val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct NandOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return ~(val1 & val2);
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct LShiftOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1 << val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct RShiftOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1 >> val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct StoreOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1&, const Scalar2& val2) { return val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct LoadOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2&) { return val1; }
+};
+
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_oper(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder order,
+                  MemoryScope scope) {
+  using cas_t = typename atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t oldval = reinterpret_cast<cas_t&>(*dest);
+  cas_t assume = oldval;
+
+  do {
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+    if (Impl::check_early_exit(op, reinterpret_cast<T&>(oldval), val)) return reinterpret_cast<T&>(oldval);
+#endif
+    assume = oldval;
+    T newval = op.apply(reinterpret_cast<T&>(assume), val);
+    oldval = desul::atomic_compare_exchange(
+        reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope);
+  } while (assume != oldval);
+
+  return reinterpret_cast<T&>(oldval);
+}
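+
+// Sketch of how the operator structs compose with the CAS loop above (assumed
+// usage; the public atomic_fetch_* entry points are defined in these terms
+// later in this header):
+//
+//   template <typename T, class MemoryOrder, class MemoryScope>
+//   DESUL_INLINE_FUNCTION T atomic_fetch_add_sketch(
+//       T* const dest, T value, MemoryOrder order, MemoryScope scope) {
+//     return Impl::atomic_fetch_oper(
+//         Impl::AddOper<T, const T>(), dest, value, order, scope);
+//   }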
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_oper_fetch(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder order,
+                  MemoryScope scope) {
+  using cas_t = typename atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t oldval = reinterpret_cast<cas_t&>(*dest);
+  T newval = val;
+  cas_t assume = oldval;
+  do {
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+    if (Impl::check_early_exit(op, reinterpret_cast<T&>(oldval), val)) return reinterpret_cast<T&>(oldval);
+#endif
+    assume = oldval;
+    newval = op.apply(reinterpret_cast<T&>(assume), val);
+    oldval = desul::atomic_compare_exchange(
+        reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope);
+  } while (assume != oldval);
+
+  return newval;
+}
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires !atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_oper(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+#if defined(DESUL_HAVE_FORWARD_PROGRESS)
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(), scope);
+  T return_val = *dest;
+  *dest = op.apply(return_val, val);
+  atomic_thread_fence(MemoryOrderRelease(), scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+#elif defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+  // This avoids deadlock within a warp or wavefront
+  T return_val;
+  int done = 0;
+#ifdef __HIPCC__
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = op.apply(return_val, val);
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+// FIXME_SYCL not implemented
+#elif defined(__SYCL_DEVICE_ONLY__)
+  (void) op;
+  (void) dest;
+  (void) scope;
+  (void) return_val;
+  (void) done;
+
+  assert(false);
+  return val;
+#else
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = op.apply(return_val, val);
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+#endif
+#else
+  static_assert(false, "Unimplemented lock based attomic\n");
+  return val;
+#endif
+}
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires !atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_oper_fetch(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+#if defined(DESUL_HAVE_FORWARD_PROGRESS)
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(), scope);
+  T return_val = op.apply(*dest, val);
+  *dest = return_val;
+  atomic_thread_fence(MemoryOrderRelease(), scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+#elif defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+  // This avoids deadlock within a warp or wavefront
+  T return_val;
+  int done = 0;
+#ifdef __HIPCC__
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = op.apply(*dest, val);
+        *dest = return_val;
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+  // FIXME_SYCL not implemented
+#elif defined(__SYCL_DEVICE_ONLY__)
+  (void) op;
+  (void) dest;
+  (void) scope;
+  (void) done;
+
+  assert(false);
+  return val;
+#else
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = op.apply(*dest, val);
+        *dest = return_val;
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+#endif
+#else
+  static_assert(false, "Unimplemented lock based atomic\n");
+  return val;
+#endif
+}
+
+}  // namespace Impl
+}  // namespace desul
+
+namespace desul {
+
+// Fetch_Oper atomics: return value before operation
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MulOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::DivOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::ModOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::NandOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::LShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::RShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+// Oper Fetch atomics: return value after operation
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MulOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::DivOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::ModOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::NandOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::LShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::RShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+// Other atomics
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_load(const T* const dest,
+                                    MemoryOrder order,
+                                    MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::LoadOper<T, const T>(), const_cast<T*>(dest), T(), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_store(T* const dest,
+                                        const T val,
+                                        MemoryOrder order,
+                                        MemoryScope scope) {
+  (void)Impl::atomic_fetch_oper(Impl::StoreOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_add(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_add(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_sub(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_sub(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_mul(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_mul(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_div(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_div(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_min(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_min(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_max(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_max(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_inc_fetch(T* const dest, MemoryOrder order, MemoryScope scope) {
+  return atomic_add_fetch(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_dec_fetch(T* const dest, MemoryOrder order, MemoryScope scope) {
+  return atomic_sub_fetch(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_fetch_add(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_fetch_sub(dest, T(1), order, scope);
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_inc(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_add(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_dec(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_sub(dest, T(1), order, scope);
+}
+
+// FIXME
+template <typename T,
+          class SuccessMemoryOrder,
+          class FailureMemoryOrder,
+          class MemoryScope>
+DESUL_INLINE_FUNCTION bool atomic_compare_exchange_strong(
+    T* const dest,
+    T& expected,
+    T desired,
+    SuccessMemoryOrder success,
+    FailureMemoryOrder /*failure*/,
+    MemoryScope scope) {
+  T const old = atomic_compare_exchange(dest, expected, desired, success, scope);
+  if (old != expected) {
+    expected = old;
+    return false;
+  } else {
+    return true;
+  }
+}
+
+template <typename T,
+          class SuccessMemoryOrder,
+          class FailureMemoryOrder,
+          class MemoryScope>
+DESUL_INLINE_FUNCTION bool atomic_compare_exchange_weak(T* const dest,
+                                                        T& expected,
+                                                        T desired,
+                                                        SuccessMemoryOrder success,
+                                                        FailureMemoryOrder failure,
+                                                        MemoryScope scope) {
+  return atomic_compare_exchange_strong(
+      dest, expected, desired, success, failure, scope);
+}
+
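+// Usage sketch (illustrative): as with std::atomic, the strong variant
+// updates `expected` on failure, so it can drive a read-modify-write retry
+// loop for an arbitrary transformation f (f is hypothetical here):
+//
+//   T cur = atomic_load(dest, MemoryOrderRelaxed(), MemoryScopeDevice());
+//   while (!atomic_compare_exchange_strong(dest, cur, f(cur),
+//                                          MemoryOrderSeqCst(),
+//                                          MemoryOrderSeqCst(),
+//                                          MemoryScopeDevice())) {
+//     // cur was refreshed with the observed value; retry with f(cur)
+//   }
+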
+}  // namespace desul
+
+#include <desul/atomics/SYCL.hpp>
+#include <desul/atomics/CUDA.hpp>
+#include <desul/atomics/GCC.hpp>
+#include <desul/atomics/HIP.hpp>
+#include <desul/atomics/OpenMP.hpp>
+#pragma GCC diagnostic pop
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/HIP.hpp b/packages/kokkos/core/src/desul/atomics/HIP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..16c1f510b7a2627408ccea374004d280997e96df
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/HIP.hpp
@@ -0,0 +1,338 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_HIP_HPP_
+#define DESUL_ATOMICS_HIP_HPP_
+
+#ifdef __HIP_DEVICE_COMPILE__
+namespace desul {
+namespace Impl {
+template <typename T>
+struct is_hip_atomic_integer_type {
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long int>::value;
+};
+
+template <typename T>
+struct is_hip_atomic_add_type {
+  static constexpr bool value = is_hip_atomic_integer_type<T>::value ||
+                                std::is_same<T, double>::value ||
+                                std::is_same<T, float>::value;
+};
+
+template <typename T>
+struct is_hip_atomic_sub_type {
+  static constexpr bool value =
+      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value;
+};
+}  // namespace Impl
+
+// Atomic Add
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest, val);
+  __threadfence();
+
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Sub
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Inc
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrderRelaxed,
+                                                MemoryScopeDevice) {
+  return atomicInc(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicInc(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeCore) {
+  return atomic_fetch_inc(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Dec
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrderRelaxed,
+                                                MemoryScopeDevice) {
+  return atomicDec(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicDec(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeCore) {
+  return atomic_fetch_dec(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Max
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMax(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMax(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_max(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Min
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMin(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMin(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_min(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic And
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAnd(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAnd(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_and(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic XOR
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicXor(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicXor(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_xor(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic OR
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicOr(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicOr(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_or(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+}  // namespace desul
+
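+// For integral types without a native HIP atomic overload above (the
+// enable_if conditions exclude them), the macro below generates overloads
+// that fall back to the generic CAS-loop implementations
+// Impl::atomic_fetch_oper and Impl::atomic_oper_fetch.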
+#define DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MEMORY_ORDER, MEMORY_SCOPE)                 \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_add_type<T>::value, T>::type atomic_fetch_add(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_sub_type<T>::value, T>::type atomic_fetch_sub(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_and(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_or(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_xor(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_nand( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::NandOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_add_type<T>::value, T>::type atomic_add_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_sub_type<T>::value, T>::type atomic_sub_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_and_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_or_fetch(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_xor_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_nand_fetch( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::NandOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }
+namespace desul {
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeNode)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeDevice)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeCore)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeNode)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeDevice)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeCore)
+}  // namespace desul
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp b/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fd0e8bbd7718a1097d898b01b20aa71ff515f2f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_HPP_
+
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Lock_Array_Cuda.hpp"
+#include "desul/atomics/Lock_Array_HIP.hpp"
+#include "desul/atomics/Macros.hpp"
+
+namespace desul {
+namespace Impl {
+struct host_locks__ {
+  static constexpr uint32_t HOST_SPACE_ATOMIC_MASK = 0xFFFF;
+  static constexpr uint32_t HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  template <typename is_always_void = void>
+  static int32_t* get_host_locks_() {
+    static int32_t HOST_SPACE_ATOMIC_LOCKS_DEVICE[HOST_SPACE_ATOMIC_MASK + 1] = {0};
+    return HOST_SPACE_ATOMIC_LOCKS_DEVICE;
+  }
+  static inline int32_t* get_host_lock_(void* ptr) {
+    return &get_host_locks_()[((uint64_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
+                              HOST_SPACE_ATOMIC_XOR_MASK];
+  }
+};
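+
+// Note: get_host_lock_ hashes the address (dropping the two low bits,
+// masking to the table size, then XOR-scrambling) so nearby addresses map to
+// different slots; unrelated addresses may still share a lock, which is safe
+// but can cause contention.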
+
+inline void init_lock_arrays() {
+  static bool is_initialized = false;
+  if (!is_initialized) {
+    host_locks__::get_host_locks_();
+    is_initialized = true;
+  }
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+  init_lock_arrays_cuda();
+#endif
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+  init_lock_arrays_hip();
+#endif
+}
+
+inline void finalize_lock_arrays() {
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+  finalize_lock_arrays_cuda();
+#endif
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+  finalize_lock_arrays_hip();
+#endif
+}
+template <typename MemoryScope>
+inline bool lock_address(void* ptr, MemoryScope ms) {
+  return 0 == atomic_exchange(host_locks__::get_host_lock_(ptr),
+                              int32_t(1),
+                              MemoryOrderSeqCst(),
+                              ms);
+}
+template <typename MemoryScope>
+void unlock_address(void* ptr, MemoryScope ms) {
+  (void)atomic_exchange(host_locks__::get_host_lock_(ptr),
+                        int32_t(0),
+                        MemoryOrderSeqCst(),
+                        ms);
+}
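+
+// Usage sketch (illustrative) of the host lock protocol built from the two
+// helpers above; the critical section is a placeholder:
+//
+//   while (!lock_address(ptr, MemoryScopeDevice())) {
+//   }
+//   /* ... read-modify-write on *ptr ... */
+//   unlock_address(ptr, MemoryScopeDevice());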
+}  // namespace Impl
+}  // namespace desul
+
+#endif  // DESUL_ATOMICS_LOCK_ARRAY_HPP_
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp b/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..de99185349043dc6e0f13c7e57c14dbc080deb9e
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
@@ -0,0 +1,172 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
+
+#include "desul/atomics/Macros.hpp"
+#include "desul/atomics/Common.hpp"
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+
+#include <cstdint>
+
+namespace desul {
+namespace Impl {
+
+#ifdef __CUDA_ARCH__
+#define DESUL_IMPL_BALLOT_MASK(m, x) __ballot_sync(m, x)
+#define DESUL_IMPL_ACTIVEMASK __activemask()
+#else
+#define DESUL_IMPL_BALLOT_MASK(m, x) ((m) == 0 ? 0 : 1)
+#define DESUL_IMPL_ACTIVEMASK 0
+#endif
+
+/// \brief These global variables in Host space are the central definition
+///        of these arrays.
+extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h;
+extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h;
+
+/// \brief After this call, the host lock-array pointers refer to
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol, to deal with Kokkos/RAJA
+///   snapshotted versions while also linking against pure Desul.
+template<typename /*AlwaysInt*/ = int>
+void init_lock_arrays_cuda();
+
+/// \brief After this call, the host lock-array pointers are all null,
+///        and all array memory has been freed.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol, to deal with Kokkos/RAJA
+///   snapshotted versions while also linking against pure Desul.
+template<typename T = int>
+void finalize_lock_arrays_cuda();
+
+}  // namespace Impl
+}  // namespace desul
+
+#if defined(__CUDACC__)
+
+namespace desul {
+namespace Impl {
+
+/// \brief These global variables in CUDA space are what kernels use
+///        to get access to the lock arrays.
+///
+/// When relocatable device code is enabled, there can be a single
+/// instance of this global variable for the entire executable,
+/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
+/// here must then be extern).
+/// This one instance will be initialized by init_lock_arrays_cuda
+/// and need not be modified afterwards.
+///
+/// When relocatable device code is disabled, an instance of this variable
+/// will be created in every translation unit that sees this header file
+/// (we make this clear by marking it static, meaning no other translation
+///  unit can link to it).
+/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
+/// instances in other translation units, we must update this CUDA global
+/// variable based on the Host global variable prior to running any kernels
+/// that will use it.
+/// That is the purpose of the DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+__device__
+#ifdef __CUDACC_RDC__
+    __constant__ extern
+#endif
+    int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
+
+__device__
+#ifdef __CUDACC_RDC__
+    __constant__ extern
+#endif
+    int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE;
+
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline bool lock_address_cuda(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[offset], 1));
+}
+__device__ inline bool lock_address_cuda(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 1));
+}
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+__device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[offset], 0);
+}
+__device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
+}
+
+}  // namespace Impl
+}  // namespace desul
+
+// Give lock_array_copied explicit translation-unit scope
+namespace desul {
+namespace Impl {
+namespace {
+static int lock_array_copied = 0;
+inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
+}  // namespace
+}  // namespace Impl
+}  // namespace desul
+/* It is critical that this code be a macro, so that it will
+   capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                       \
+  {                                                                        \
+    if (::desul::Impl::lock_array_copied == 0) {                           \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE,    \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
+                         sizeof(int32_t*));                                \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE,    \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \
+                         sizeof(int32_t*));                                \
+    }                                                                      \
+    ::desul::Impl::lock_array_copied = 1;                                  \
+  }
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
+
+#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+#endif
+
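+// Illustrative host-side call sequence (my_kernel is hypothetical):
+// initialize the lock arrays once, then make sure the device-side pointers
+// are valid before launching any kernel that may take the lock-based path:
+//
+//   desul::Impl::init_lock_arrays_cuda();
+//   DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+//   my_kernel<<<blocks, threads>>>(/* ... */);
+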
+#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp b/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e6f5e59800b6778bf2c0592f0104526a730ac00
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
@@ -0,0 +1,170 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_HIP_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_HIP_HPP_
+
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Macros.hpp"
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+
+#include <hip/hip_runtime.h>
+
+#include <cstdint>
+
+namespace desul {
+namespace Impl {
+
+#ifdef __HIP_DEVICE_COMPILE__
+#define DESUL_IMPL_BALLOT_MASK(x) __ballot(x)
+#endif
+
+/**
+ * \brief These global variables in Host space are the central definition of
+ * these arrays.
+ */
+extern int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE_h;
+extern int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h;
+
+/// \brief After this call, the host lock-array pointers refer to
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol, to deal with Kokkos/RAJA
+///   snapshotted versions while also linking against pure Desul.
+template<typename T = int>
+void init_lock_arrays_hip();
+
+/// \brief After this call, the host lock-array pointers are all null,
+///        and all array memory has been freed.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol, to deal with Kokkos/RAJA
+///   snapshotted versions while also linking against pure Desul.
+template<typename T = int>
+void finalize_lock_arrays_hip();
+}  // namespace Impl
+}  // namespace desul
+
+#ifdef __HIPCC__
+namespace desul {
+namespace Impl {
+
+/**
+ * \brief These global variables in HIP space are what kernels use to get
+ * access to the lock arrays.
+ *
+ * When relocatable device code is enabled, there can be a single instance of
+ * this global variable for the entire executable, whose definition will be in
+ * Kokkos_HIP_Locks.cpp (and whose declaration here must then be extern). This
+ * one instance will be initialized by init_lock_arrays_hip and need
+ * not be modified afterwards.
+ *
+ * When relocatable device code is disabled, an instance of this variable will
+ * be created in every translation unit that sees this header file (we make this
+ * clear by marking it static, meaning no other translation unit can link to
+ * it). Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the
+ * instances in other translation units, we must update this HIP global
+ * variable based on the Host global variable prior to running any kernels that
+ * will use it.  That is the purpose of the
+ * DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro.
+ */
+__device__
+#ifdef DESUL_HIP_RDC
+    __constant__ extern
+#endif
+    int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE;
+
+__device__
+#ifdef DESUL_HIP_RDC
+    __constant__ extern
+#endif
+    int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE;
+
+#define HIP_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline bool lock_address_hip(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[offset], 1));
+}
+
+__device__ inline bool lock_address_hip(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 1));
+}
+
+/**
+ * \brief Release lock for the address
+ *
+ * This function releases the lock for the hash value derived from the provided
+ * ptr. This function should only be called after previously successfully
+ * acquiring a lock with lock_address.
+ */
+__device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[offset], 0);
+}
+
+__device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
+}
+}  // namespace Impl
+}  // namespace desul
+#endif
+
+// Give lock_array_copied explicit translation-unit scope
+namespace desul {
+namespace Impl {
+namespace {
+static int lock_array_copied = 0;
+inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
+}  // namespace
+}  // namespace Impl
+}  // namespace desul
+
+/* It is critical that this code be a macro, so that it will
+   capture the right address for desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE;
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                               \
+  {                                                                               \
+    if (::desul::Impl::lock_array_copied == 0) {                                  \
+      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE), \
+                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,          \
+                        sizeof(int32_t*));                                        \
+      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE),   \
+                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE_h,            \
+                        sizeof(int32_t*));                                        \
+    }                                                                             \
+    ::desul::Impl::lock_array_copied = 1;                                         \
+  }
+
+#endif
+
+#if defined(DESUL_HIP_RDC) || (!defined(__HIPCC__))
+#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
+#endif
+
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Macros.hpp b/packages/kokkos/core/src/desul/atomics/Macros.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..db9962e03bd84052a4d61a89cf39892e40051b89
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Macros.hpp
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_MACROS_HPP_
+#define DESUL_ATOMICS_MACROS_HPP_
+
+// Macros
+
+#if defined(__GNUC__) && \
+    (!defined(__CUDA_ARCH__) || !defined(__NVCC__)) && \
+    (!defined(__HIP_DEVICE_COMPILE__) || !defined(__HIP_PLATFORM_HCC__)) && \
+    !defined(__SYCL_DEVICE_ONLY__) && \
+    !defined(DESUL_HAVE_OPENMP_ATOMICS) && \
+    !defined(DESUL_HAVE_SERIAL_ATOMICS)
+#define DESUL_HAVE_GCC_ATOMICS
+#endif
+
+#ifdef _MSC_VER
+#define DESUL_HAVE_MSVC_ATOMICS
+#endif
+
+#ifdef __CUDACC__
+#define DESUL_HAVE_CUDA_ATOMICS
+#endif
+
+#ifdef __HIPCC__
+#define DESUL_HAVE_HIP_ATOMICS
+#endif
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define DESUL_HAVE_SYCL_ATOMICS
+#ifdef __clang__
+#define DESUL_SYCL_NAMESPACE sycl::ONEAPI
+#else
+#define DESUL_SYCL_NAMESPACE sycl
+#endif
+#endif
+
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
+#define DESUL_HAVE_GPU_LIKE_PROGRESS
+#endif
+
+#if defined(DESUL_HAVE_CUDA_ATOMICS) || defined(DESUL_HAVE_HIP_ATOMICS)
+#define DESUL_FORCEINLINE_FUNCTION inline __host__ __device__
+#define DESUL_INLINE_FUNCTION inline __host__ __device__
+#define DESUL_FUNCTION __host__ __device__
+#else
+#define DESUL_FORCEINLINE_FUNCTION inline
+#define DESUL_INLINE_FUNCTION inline
+#define DESUL_FUNCTION
+#endif
+
+#if !defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+#define DESUL_HAVE_FORWARD_PROGRESS
+#endif
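+
+// Net effect of the checks above: exactly one progress guarantee is defined.
+// Host-only builds get DESUL_HAVE_FORWARD_PROGRESS and may block on the
+// lock-based fallback; CUDA/HIP/SYCL device passes get
+// DESUL_HAVE_GPU_LIKE_PROGRESS and must use the ballot-based loops that
+// avoid deadlock within a warp or wavefront.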
+
+#endif  // DESUL_ATOMICS_MACROS_HPP_
diff --git a/packages/kokkos/core/src/desul/atomics/OpenMP.hpp b/packages/kokkos/core/src/desul/atomics/OpenMP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fa22c36aca37e9e91e5b08aac9b3e61b8256ebc
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/OpenMP.hpp
@@ -0,0 +1,15 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_OPENMP_HPP_
+#define DESUL_ATOMICS_OPENMP_HPP_
+
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+
+#include <desul/atomics/openmp/OpenMP_40.hpp>
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/SYCL.hpp b/packages/kokkos/core/src/desul/atomics/SYCL.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..44e2dc0ec4ea843d6b6b4e9896b27fc63df6baad
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/SYCL.hpp
@@ -0,0 +1,143 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_SYCL_HPP_
+#define DESUL_ATOMICS_SYCL_HPP_
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Common.hpp"
+
+namespace desul {
+namespace Impl {
+template<class T>
+struct is_sycl_atomic_type {
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, long>::value ||
+                                std::is_same<T, unsigned long>::value ||
+                                std::is_same<T, long long>::value ||
+                                std::is_same<T, unsigned long long int>::value ||
+                                std::is_same<T, float>::value ||
+                                std::is_same<T, double>::value;
+};
+}  // namespace Impl
+
+// Atomic Add
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,  
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*dest);
+  return dest_ref.fetch_add(val);
+}
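+
+// Sketch (illustrative) of what the wrapper above expands to for a relaxed,
+// device-scope fetch-add, assuming DesulToSYCLMemoryOrder/-Scope map relaxed
+// and device to their SYCL counterparts:
+//
+//   DESUL_SYCL_NAMESPACE::atomic_ref<
+//       T, DESUL_SYCL_NAMESPACE::memory_order::relaxed,
+//       DESUL_SYCL_NAMESPACE::memory_scope::device,
+//       sycl::access::address_space::global_device_space>
+//       dest_ref(*dest);
+//   return dest_ref.fetch_add(val);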
+
+// Atomic Sub 
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_sub(val);
+}
+
+// Atomic Inc
+template<class MemoryOrder/*, class MemoryScope*/>
+inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder memory_order, MemoryScopeDevice memory_scope) {
+  return atomic_fetch_add(dest, val, memory_order, memory_scope);
+}
+
+// Atomic Dec
+template<class MemoryOrder/*, class MemoryScope*/>
+inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder memory_order, MemoryScopeDevice memory_scope) {
+  return atomic_fetch_sub(dest, val, memory_order, memory_scope);
+}
+
+// Atomic Max
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_max(val);
+}
+
+// Atomic Min
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_min(val);
+}
+
+// Atomic And
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_and(val);
+}
+
+// Atomic XOR
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_xor(val);
+}
+
+// Atomic OR
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_or(val);
+}
+
+} // desul
+#endif  // DESUL_HAVE_SYCL_ATOMICS
+#endif  // DESUL_ATOMICS_SYCL_HPP_
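For orientation, a minimal usage sketch of the SYCL backend above. It is illustrative only: it assumes the umbrella header `desul/atomics.hpp` pulls these definitions in, that `DESUL_HAVE_SYCL_ATOMICS` is defined, and that `counter` points to USM device memory.

```c++
// Hypothetical caller of the SYCL backend (not part of this patch).
#include <CL/sycl.hpp>
#include <desul/atomics.hpp>  // assumed umbrella header

void count_hits(sycl::queue& q, unsigned int* counter /* USM device ptr */) {
  q.parallel_for(sycl::range<1>(128), [=](sycl::id<1>) {
    // Dispatches to the atomic_fetch_add overload above, which wraps the
    // destination in a SYCL atomic_ref with the translated order/scope.
    desul::atomic_fetch_add(counter, 1u,
                            desul::MemoryOrderRelaxed(),
                            desul::MemoryScopeDevice());
  }).wait();
}
```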
diff --git a/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp b/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a66e5cf051f684695b17b7fae9fe2aaa5009a2c3
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp
@@ -0,0 +1,58 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_SYCL_CONVERSIONS_HPP_
+#define DESUL_ATOMICS_SYCL_CONVERSIONS_HPP_
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Common.hpp"
+#include <CL/sycl.hpp>
+
+namespace desul {
+
+template<class MemoryOrder>
+struct DesulToSYCLMemoryOrder;
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::seq_cst;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderAcquire> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::acquire;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderRelease> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::release;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::acq_rel;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::relaxed;
+};
+
+template<class MemoryScope>
+struct DesulToSYCLMemoryScope;
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeCore> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::work_group;
+};
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeDevice> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::device;
+};
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeSystem> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::system;
+};
+
+}  // namespace desul
+
+#endif  // DESUL_HAVE_SYCL_ATOMICS
+#endif  // DESUL_ATOMICS_SYCL_CONVERSIONS_HPP_
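These traits are plain compile-time lookup tables; a hedged sketch of how the mapping can be spot-checked once this header is included (no runtime behavior is implied):

```c++
// Illustrative compile-time checks against the specializations above.
static_assert(desul::DesulToSYCLMemoryOrder<desul::MemoryOrderRelaxed>::value ==
                  DESUL_SYCL_NAMESPACE::memory_order::relaxed,
              "relaxed order maps to sycl relaxed");
static_assert(desul::DesulToSYCLMemoryScope<desul::MemoryScopeCore>::value ==
                  DESUL_SYCL_NAMESPACE::memory_scope::work_group,
              "core scope maps to sycl work_group");
```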
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..461d3e0928a19840973d1c971ccd100968193a42
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp
@@ -0,0 +1,18 @@
+#include <limits>
+namespace desul {
+#if defined(__CUDA_ARCH__)  || (defined(__clang__) && !defined(__NVCC__))
+// Choose the variant of atomics we are using later
+#if !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL)
+#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__==11) && (__CUDACC_VER_MINOR__>1))
+#define DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#else
+#define DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#endif
+#endif
+#include <desul/atomics/cuda/cuda_cc7_asm.inc>
+
+#endif
+}
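The block above selects a PTX lowering strategy once, at compile time: nvcc 11.2 and newer default to the `__isGlobal`-based dispatch, older toolchains to the predicated PTX form, and any of the four `DESUL_IMPL_ATOMIC_CUDA_PTX_*` macros may be predefined to override the default. A hedged sketch of what an explicit override looks like on the build line, plus a guard one might add (the `#error` is illustrative, not part of the patch):

```c++
// Force the generic (always generic-address-space) variant:
//   nvcc -DDESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC ...
#if defined(DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC) && \
    defined(DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL)
#error "Define at most one DESUL_IMPL_ATOMIC_CUDA_PTX_* variant"
#endif
```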
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ab95e6a00a4e67ffad0232f8652e8a0c9d4f6e6
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp
@@ -0,0 +1,8 @@
+#include <limits>
+namespace desul {
+#if defined(__CUDA_ARCH__)  || (defined(__clang__) && !defined(__NVCC__))
+
+#include <desul/atomics/cuda/cuda_cc7_asm_exchange.inc>
+
+#endif
+}
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc
new file mode 100644
index 0000000000000000000000000000000000000000..2bc64a74b2caf06a731a319c07c7b26490924aa3
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc
@@ -0,0 +1,20 @@
+
+// Non-returning atomic operations (the PTX red instruction) exist only for the relaxed and release memory orders
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeDevice
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".gpu"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeNode
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".sys"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeCore
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".cta"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
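Each include of `cuda_cc7_asm_memorder.inc` stamps out the full operation set for one scope, with the scope macros above pasted directly into the instruction strings. A worked example of the token concatenation, assuming the memorder pass supplies `".relaxed"` for `__DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM`:

```c++
// For MemoryScopeDevice (".gpu") and a relaxed pass, a 32-bit add's
// instruction string concatenates, literal by literal, into:
//   "atom.add" ".relaxed" ".gpu" ".u32" " %0,[%1],%2;"
// i.e. the single PTX instruction (modifier order as emitted here):
//   atom.add.relaxed.gpu.u32 %0,[%1],%2;
```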
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..6de590a952fa38f60c58a34c9499a420502ae381
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
@@ -0,0 +1,18 @@
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC
+#include "cuda_cc7_asm_atomic_fetch_op.inc_generic"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#include "cuda_cc7_asm_atomic_fetch_op.inc_isglobal"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#include "cuda_cc7_asm_atomic_fetch_op.inc_predicate"
+#endif
+
+// This version is not generally safe
+// Only here for performance comparison purposes
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL
+#include "cuda_cc7_asm_atomic_fetch_op.inc_forceglobal"
+#endif
+
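The four included bodies trade safety against speed: generic always issues generic-address atomics; isglobal branches on `__isGlobal` in C++ and uses the faster `.global` form when it can; predicate folds that test into PTX predication; forceglobal assumes every pointer is global, which is why it is flagged unsafe above. A sketch of the failure mode forceglobal risks (hypothetical kernel, illustrative only):

```c++
// Why forceglobal is not generally safe: a __shared__ object is not in
// the global address space, so an atom.*.global on its address is invalid.
__global__ void bad_idea() {
  __shared__ unsigned int block_local;  // shared, not global, space
  // forceglobal's atomic_fetch_add(&block_local, ...) would emit
  // atom.add.global on a non-global pointer: undefined behavior.
}
```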
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
new file mode 100644
index 0000000000000000000000000000000000000000..d00e2223d22485f4ee831dd53dce064a49deec5e
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
@@ -0,0 +1,143 @@
+
+// Inline PTX register constraints: h=u16, r=u32, l=u64, f=f32, d=f64
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+// Undef all helpers (including the group macros) so each per-memory-order
+// inclusion of this file is self-contained.
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP
+
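Instantiation here is purely textual: each `__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_*` call stamps out one overload per memory order/scope pass. An approximate expansion of the `float` add above, assuming a pass where the order token is `".relaxed"` and the scope is `MemoryScopeDevice`/`".gpu"`:

```c++
// What __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
// expands to on one pass (illustrative, whitespace tidied):
inline __device__ float atomic_fetch_add(float* dest, float value,
                                         MemoryOrderRelaxed, MemoryScopeDevice) {
  float result = 0;
  asm volatile("atom.add.global.relaxed.gpu.f32 %0,[%1],%2;"
               : "=f"(result)
               : "l"(dest), "f"(value)
               : "memory");
  return result;
}
```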
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
new file mode 100644
index 0000000000000000000000000000000000000000..364b6a2e4d1950f110c29958976a660d01d05771
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
@@ -0,0 +1,142 @@
+
+// Inline PTX register constraints: h=u16, r=u32, l=u64, f=f32, d=f64
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+// Undef all helpers (including the group macros) so each per-memory-order
+// inclusion of this file is self-contained.
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP
+
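The inc/dec wrappers lean on PTX semantics: `atom.inc`/`atom.dec` take a limit operand and wrap around it, so passing `numeric_limits_max<ctype>` turns them into ordinary modular increment/decrement that also returns the old value. Reference semantics, per the PTX ISA (host-side illustration, not part of desul):

```c++
// What atom.inc computes, atomically, when limit is the type's maximum:
inline unsigned int atom_inc_reference(unsigned int* dest, unsigned int limit) {
  unsigned int old = *dest;                // read-modify-write is atomic in HW
  *dest = (old >= limit) ? 0u : old + 1u;  // wrap only at the limit
  return old;
}
```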
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
new file mode 100644
index 0000000000000000000000000000000000000000..2e8e54062dd3494f7440b959618359ef0547d87b
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
@@ -0,0 +1,190 @@
+
+// Inline PTX register constraints: h=u16, r=u32, l=u64, f=f32, d=f64
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  } else { \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } else { \
+  asm volatile("atom.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } else { \
+  asm volatile("atom.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+// Undef all helpers (including the group macros) so each per-memory-order
+// inclusion of this file is self-contained.
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP
+
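`__isGlobal` is a CUDA device intrinsic reporting whether a generic pointer refers to global memory; this variant pays one runtime branch to use the cheaper `.global` form whenever it applies. The shape of the dispatch, reduced to a plain function (order/scope modifiers omitted for brevity):

```c++
// Simplified sketch of the isglobal dispatch pattern used above.
__device__ unsigned int fetch_add_sketch(unsigned int* dest, unsigned int v) {
  unsigned int result;
  if (__isGlobal(dest)) {
    asm volatile("atom.add.global.u32 %0,[%1],%2;"
                 : "=r"(result) : "l"(dest), "r"(v) : "memory");
  } else {
    asm volatile("atom.add.u32 %0,[%1],%2;"
                 : "=r"(result) : "l"(dest), "r"(v) : "memory");
  }
  return result;
}
```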
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
new file mode 100644
index 0000000000000000000000000000000000000000..5f53279daf541aad169e1bc5a046518cc5b084c8
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
@@ -0,0 +1,226 @@
+
+// Inline PTX register constraints: h=u16, r=u32, l=u64, f=f32, d=f64
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.inc.gobal" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.inc"       __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+// Undef all helpers (including the group macros) so each per-memory-order
+// inclusion of this file is self-contained.
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP
+
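The predicate variant performs the same address-space test inside PTX: `isspacep.global` sets predicate register `p`, and the `@p`/`@!p` guards select the `.global` or generic instruction without a C++-level branch. The emitted sequence for a 32-bit fetch-and, with the order/scope tokens elided:

```c++
// PTX produced by the predicated 32-bit atomic_fetch_and (illustrative):
//   {
//   .reg .pred p;
//   isspacep.global p, %1;        // p = (dest is a global-space address)
//   @p  atom.and.global.b32 %0,[%1],%2;
//   @!p atom.and.b32        %0,[%1],%2;
//   }
```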
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ca02410515db3491c002b569f01ee150d1c0c683
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
@@ -0,0 +1,18 @@
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC
+#include "cuda_cc7_asm_atomic_op.inc_generic"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#include "cuda_cc7_asm_atomic_op.inc_isglobal"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#include "cuda_cc7_asm_atomic_op.inc_predicate"
+#endif
+
+// This version is not generally safe
+// Only here for performance comparison purposes
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL
+#include "cuda_cc7_asm_atomic_op.inc_forceglobal"
+#endif
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
new file mode 100644
index 0000000000000000000000000000000000000000..3767b2ab4980c0d811357a6d0e1a912de5bba500
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
@@ -0,0 +1,64 @@
+
+// Inline PTX register constraints: h=u16, r=u32, l=u64, f=f32, d=f64
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
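For orientation, here is roughly what one instantiation above stamps out once the including files have bound the order and scope macros (a hand-expansion sketch of `__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")` under relaxed order and device scope; not part of the patch):

```cpp
// Sketch: approximate expansion for (float, MemoryOrderRelaxed, MemoryScopeDevice),
// i.e. ".relaxed" and ".gpu" spliced into the instruction string. Illustrative only.
inline __device__ void atomic_add(float* dest, float value,
                                  MemoryOrderRelaxed, MemoryScopeDevice) {
  // Non-returning reduction: fire-and-forget add into global memory.
  asm volatile("red.add.global.relaxed.gpu.f32 [%0],%1;"
               :: "l"(dest), "f"(value)
               : "memory");
}
```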
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
new file mode 100644
index 0000000000000000000000000000000000000000..5de36a3e0a87b967fff5d9a936644c5e8a566051
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
@@ -0,0 +1,64 @@
+
+// Inline PTX register constraints: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: non-returning reductions (ptx red): add, sub (as add of -value), min, max, inc, dec
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
new file mode 100644
index 0000000000000000000000000000000000000000..ba8937883423e62b9461b6d764d24022531db719
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
@@ -0,0 +1,88 @@
+
+// Inline PTX register constraints: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: non-returning reductions (ptx red): add, sub (as add of -value), min, max, inc, dec
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+  } else { \
+  asm volatile("red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } else { \
+  asm volatile("red.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } else { \
+  asm volatile("red.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
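This `_isglobal` variant is one of several dispatch strategies among these .inc files: the variant above always emits the `.global`-qualified instruction, the `_generic` variant always emits the generic-address form, and this one tests the pointer's state space at runtime via the `__isGlobal` intrinsic. Reduced to a single hard-coded case, the idea is:

```cpp
// Sketch of the _isglobal dispatch for one case (int, relaxed order, device
// scope). Illustrative only; the real code is stamped out by the macros above.
inline __device__ void atomic_add_isglobal_sketch(int* dest, int value) {
  if (__isGlobal(dest)) {
    // Pointer provably in the global state space: use the .global form.
    asm volatile("red.add.global.relaxed.gpu.s32 [%0],%1;"
                 :: "l"(dest), "r"(value) : "memory");
  } else {
    // Generic-address form: valid for any state space.
    asm volatile("red.add.relaxed.gpu.s32 [%0],%1;"
                 :: "l"(dest), "r"(value) : "memory");
  }
}
```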
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
new file mode 100644
index 0000000000000000000000000000000000000000..46e0ccf5e747e151039f68e17a72c82a05fc14fc
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
@@ -0,0 +1,106 @@
+
+// Inline PTX register constraints: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: non-returning reductions (ptx red): add, sub (as add of -value), min, max, inc, dec
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
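The `_predicate` variant performs the same state-space dispatch, but entirely inside one PTX block: `isspacep.global` sets a predicate register, which then guards the two instruction forms, avoiding a C++-level branch. A one-case sketch:

```cpp
// Sketch of the predicated dispatch (int, relaxed order, device scope).
inline __device__ void atomic_add_predicate_sketch(int* dest, int value) {
  asm volatile(
      "{\n\t"
      ".reg .pred p;\n\t"
      "isspacep.global p, %0;\n\t"  // p = (dest points into global space)
      "@p  red.add.global.relaxed.gpu.s32 [%0],%1;\n\t"
      "@!p red.add.relaxed.gpu.s32 [%0],%1;\n\t"
      "}\n\t"
      :: "l"(dest), "r"(value) : "memory");
}
```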
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
new file mode 100644
index 0000000000000000000000000000000000000000..dfd211249fcc625733e1737d9f2258bf9d066ef9
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
@@ -0,0 +1,20 @@
+
+// Generate atomic_exchange / atomic_compare_exchange overloads for each memory scope
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeDevice
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".gpu"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeNode
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".sys"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeCore
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".cta"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
new file mode 100644
index 0000000000000000000000000000000000000000..7b4f7d094e8ac566213b6f11938dab721da003fa
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
@@ -0,0 +1,27 @@
+
+// Generate the exchange operations for each memory order; the ptx atom instruction supports relaxed, acquire, release, and acq_rel
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelaxed
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".relaxed"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelease
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".release"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcquire
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acquire"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcqRel
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acq_rel"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..51d992087e35f8842fbc143f8e903273499d4507
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
@@ -0,0 +1,40 @@
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_exchange(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.exch" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_exchange(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.exch" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_compare_exchange(ctype* dest, ctype compare, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_compare = reinterpret_cast<uint32_t&>(compare); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.cas" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2,%3;" : "=r"(asm_result) : "l"(dest),"r"(asm_compare),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_compare_exchange(ctype* dest, ctype compare, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_compare = reinterpret_cast<uint64_t&>(compare); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.cas" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2,%3;" : "=l"(asm_result) : "l"(dest),"l"(asm_compare),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE()
+__DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE
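The two macros above generate size-dispatched (4- and 8-byte) exchange and compare-exchange overloads selected by tag arguments. A hedged usage sketch; the `desul::` qualification assumes these .inc files are ultimately included inside `namespace desul`, which this patch does not show:

```cpp
// Sketch: calling the generated overloads from device code. Illustrative only.
__device__ void exchange_demo(double* slot, int32_t* flag) {
  // 8-byte exchange (double reinterpreted as uint64_t), relaxed, device scope.
  double old = desul::atomic_exchange(slot, 1.0,
                                      desul::MemoryOrderRelaxed(),
                                      desul::MemoryScopeDevice());
  // 4-byte CAS: install 1 if *flag == 0, acquire-release, device scope.
  int32_t prev = desul::atomic_compare_exchange(flag, 0, 1,
                                                desul::MemoryOrderAcqRel(),
                                                desul::MemoryScopeDevice());
  (void)old;
  (void)prev;
}
```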
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
new file mode 100644
index 0000000000000000000000000000000000000000..3eb613d8a74c5bfc911bd51787dac9c4a972f698
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
@@ -0,0 +1,29 @@
+
+// Non-returning atomic operations (ptx red instruction) only exist for relaxed and release memory orders
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelaxed
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".relaxed"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelease
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".release"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcquire
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acquire"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcqRel
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acq_rel"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
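Together these .inc files form a multiple-inclusion (X-macro-style) cross product: a scope-level file pins the scope macros and includes this memorder file, which pins the order macros and includes an op file, so every order × scope × op × type combination gets stamped out. A compilable miniature of the pattern, with hypothetical macro names:

```cpp
// Miniature of the multiple-inclusion pattern; the real code splits the
// #define / #include / #undef steps across three .inc files.
#define DEMO_ORDER_ASM ".relaxed"
#define DEMO_SCOPE_ASM ".gpu"
inline __device__ void demo_add(int* dest, int value) {
  // The strings concatenate to "red.add.relaxed.gpu.s32 [%0],%1;".
  asm volatile("red.add" DEMO_ORDER_ASM DEMO_SCOPE_ASM ".s32 [%0],%1;"
               :: "l"(dest), "r"(value) : "memory");
}
#undef DEMO_SCOPE_ASM
#undef DEMO_ORDER_ASM
```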
diff --git a/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f4f1bbd96ee6b73d5ec15372256bb4177c033234
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp
@@ -0,0 +1,97 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_OPENMP40_HPP_
+#define DESUL_ATOMICS_OPENMP40_HPP_
+#include<type_traits>
+
+namespace desul {
+namespace Impl {
+  template<class MEMORY_ORDER_TMP, class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MEMORY_ORDER_TMP, MEMORY_SCOPE_TMP) {}
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderAcquire, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcquire(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderAcqRel, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcqRel(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderSeqCst, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderSeqCst(), MEMORY_SCOPE_TMP());
+  }
+
+  template<class MEMORY_ORDER_TMP, class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MEMORY_ORDER_TMP, MEMORY_SCOPE_TMP) {}
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderRelease, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderRelease(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderAcqRel, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcqRel(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderSeqCst, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderSeqCst(), MEMORY_SCOPE_TMP());
+  }
+
+  template<class T>
+  struct is_openmp_atomic_type_t {
+    static constexpr bool value = std::is_arithmetic<T>::value;
+  };
+  template<class T>
+  constexpr bool is_openmp_atomic_type_v = is_openmp_atomic_type_t<T>::value;
+}  // namespace Impl
+}  // namespace desul
+
+namespace desul {
+// Can't use a macro approach to get all definitions since the ops include #pragma omp,
+// so we use multiple inclusion of the same code snippet instead.
+
+// Node-level atomics can't be done this way with OpenMP Target, but we could
+// have a define which says whether or not device level IS node level (e.g. for a pure CPU node).
+
+#define MEMORY_ORDER MemoryOrderRelaxed
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+
+#define MEMORY_ORDER MemoryOrderAcqRel
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+
+#define MEMORY_ORDER MemoryOrderSeqCst
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+}  // namespace desul
+#endif
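A short usage sketch of one overload stamped out by the inclusions above (relaxed order, device scope); the printed total assumes the usual OpenMP semantics and is not part of the patch:

```cpp
#include <cstdio>

// Sketch: host-side use of the generated OpenMP atomic overloads.
void openmp_demo() {
  int counter = 0;
#pragma omp parallel for
  for (int i = 0; i < 1000; ++i) {
    desul::atomic_fetch_add(&counter, 1, desul::MemoryOrderRelaxed(),
                            desul::MemoryScopeDevice());
  }
  std::printf("counter = %d\n", counter);  // expected: counter = 1000
}
```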
diff --git a/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..a65f2a457dff8b2ec6e186411359e73b729fb5e1
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc
@@ -0,0 +1,101 @@
+
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_fetch_add(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { tmp = *dest; *dest += value; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_fetch_sub(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { tmp = *dest; *dest -= value; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_fetch_and(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { tmp = *dest; *dest &= value; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_fetch_or(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { tmp = *dest; *dest |= value; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_fetch_xor(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { tmp = *dest; *dest ^= value; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_add_fetch(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { *dest += value; tmp = *dest; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_sub_fetch(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { *dest -= value; tmp = *dest; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_and_fetch(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { *dest &= value; tmp = *dest; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_or_fetch(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { *dest |= value; tmp = *dest; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
+  template <typename T>
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>, T> atomic_xor_fetch(
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {
+    T tmp;
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture
+    { *dest ^= value; tmp = *dest; }
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    return tmp;
+  }
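The two shapes of `#pragma omp atomic capture` used above determine whether the old or the updated value is returned; a minimal standalone illustration:

```cpp
// Capture BEFORE the update -> fetch_add semantics (returns the old value).
int fetch_add_demo(int* dest, int value) {
  int old;
#pragma omp atomic capture
  { old = *dest; *dest += value; }
  return old;
}

// Capture AFTER the update -> add_fetch semantics (returns the new value).
int add_fetch_demo(int* dest, int value) {
  int updated;
#pragma omp atomic capture
  { *dest += value; updated = *dest; }
  return updated;
}
```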
diff --git a/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp b/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8913f8bc7b80fc11844438f384582e7a036c824f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
@@ -0,0 +1,98 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#include <desul/atomics/Lock_Array.hpp>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+#ifdef __CUDACC_RDC__
+namespace desul {
+namespace Impl {
+__device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
+__device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE = nullptr;
+}  // namespace Impl
+}  // namespace desul
+#endif
+
+namespace desul {
+
+namespace {
+
+__global__ void init_lock_arrays_cuda_kernel() {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < CUDA_SPACE_ATOMIC_MASK + 1) {
+    Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[i] = 0;
+    Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[i] = 0;
+  }
+}
+
+}  // namespace
+
+namespace Impl {
+
+
+int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+
+// Put this in an anonymous namespace so we don't get multiply defined symbols
+// when linking in more than one copy of the object file
+namespace {
+
+void check_error_and_throw_cuda(cudaError e, const std::string& msg) {
+  if(e != cudaSuccess) {
+    std::ostringstream out;
+    out << "Desul::Error: " << msg << " error(" << cudaGetErrorName(e)
+                  << "): " << cudaGetErrorString(e);
+    throw std::runtime_error(out.str());
+  }
+}
+
+}  // namespace
+
+// define functions
+template<typename T>
+void init_lock_arrays_cuda() {
+  if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
+  auto error_malloc1 = cudaMalloc(&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
+                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc1, "init_lock_arrays_cuda: cudaMalloc device locks");
+
+  auto error_malloc2 = cudaMallocHost(&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
+                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc2, "init_lock_arrays_cuda: cudaMallocHost host locks");
+
+  auto error_sync1 = cudaDeviceSynchronize();
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
+  init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
+  auto error_sync2 = cudaDeviceSynchronize();
+  check_error_and_throw_cuda(error_sync2, "init_lock_arrays_cuda: post init kernel");
+}
+
+template<typename T>
+void finalize_lock_arrays_cuda() {
+  if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
+  cudaFree(CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h);
+  cudaFreeHost(CUDA_SPACE_ATOMIC_LOCKS_NODE_h);
+  CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+  CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+#ifdef __CUDACC_RDC__
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+// Instantiate functions
+template void init_lock_arrays_cuda<int>();
+template void finalize_lock_arrays_cuda<int>();
+
+}  // namespace Impl
+
+}  // namespace desul
+#endif
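These lock arrays back desul's fallback path for operations with no native atomic instruction: the target address is hashed to a slot (bounded by CUDA_SPACE_ATOMIC_MASK) and the slot is claimed by CAS. A rough sketch of the consuming side; the shift amount and helper names here are illustrative assumptions, not desul's actual implementation:

```cpp
// Sketch: hash an address into the lock array and spin until the slot is won.
// (The real code pairs this with a warp-cooperative retry loop, shown further
// below, so that divergent lanes cannot deadlock the warp.)
__device__ void locked_update_sketch(void* ptr, int32_t* locks, uint64_t mask) {
  uint64_t slot = (reinterpret_cast<uint64_t>(ptr) >> 2) & mask;  // pick a slot
  while (atomicCAS(&locks[slot], 0, 1) != 0) {
    // spin: another thread holds this lock slot
  }
  // ... perform the non-atomic read-modify-write under the lock ...
  atomicExch(&locks[slot], 0);  // release the slot
}
```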
diff --git a/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp b/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40030df643fa1e85a0fe433b0135386418590193
--- /dev/null
+++ b/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
@@ -0,0 +1,101 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#include <cinttypes>
+#include <desul/atomics/Lock_Array.hpp>
+#include <string>
+#include <sstream>
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+#ifdef DESUL_HIP_RDC
+namespace desul {
+namespace Impl {
+__device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
+__device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE = nullptr;
+}  // namespace Impl
+}  // namespace desul
+#endif
+
+namespace desul {
+
+namespace {
+
+__global__ void init_lock_arrays_hip_kernel() {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < HIP_SPACE_ATOMIC_MASK + 1) {
+    Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[i] = 0;
+    Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[i] = 0;
+  }
+}
+
+}  // namespace
+
+namespace Impl {
+
+int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+
+// Put this in an anonymous namespace so we don't get multiply defined symbols
+// when linking in more than one copy of the object file
+namespace {
+
+void check_error_and_throw_hip(hipError_t e, const std::string& msg) {
+  if(e != hipSuccess) {
+    std::ostringstream out;
+    out << "Desul::Error: " << msg << " error(" << hipGetErrorName(e)
+                  << "): " << hipGetErrorString(e);
+    throw std::runtime_error(out.str());
+  }
+}
+
+}  // namespace
+
+template<typename T>
+void init_lock_arrays_hip() {
+  if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
+
+  auto error_malloc1 = hipMalloc(&HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,
+            sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc1, "init_lock_arrays_hip: hipMalloc device locks");
+
+  auto error_malloc2 = hipHostMalloc(&HIP_SPACE_ATOMIC_LOCKS_NODE_h,
+                sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc2, "init_lock_arrays_hip: hipHostMalloc host locks");
+
+  auto error_sync1 = hipDeviceSynchronize();
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+  check_error_and_throw_hip(error_sync1, "init_lock_arrays_hip: post malloc");
+
+  init_lock_arrays_hip_kernel<<<(HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
+
+  auto error_sync2 = hipDeviceSynchronize();
+  check_error_and_throw_hip(error_sync2, "init_lock_arrays_hip: post init");
+}
+
+template<typename T>
+void finalize_lock_arrays_hip() {
+  if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
+  auto error_free1 = hipFree(HIP_SPACE_ATOMIC_LOCKS_DEVICE_h);
+  check_error_and_throw_hip(error_free1, "finalize_lock_arrays_hip: free device locks");
+  auto error_free2 = hipHostFree(HIP_SPACE_ATOMIC_LOCKS_NODE_h);
+  check_error_and_throw_hip(error_free2, "finalize_lock_arrays_hip: free host locks");
+  HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+  HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+#ifdef DESUL_HIP_RDC
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+template void init_lock_arrays_hip<int>();
+template void finalize_lock_arrays_hip<int>();
+
+}  // namespace Impl
+
+}  // namespace desul
+#endif
+
diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
index 7754daa8a0189a3d0708ce6505955be4b76b2d61..0ce680cd69efb1de0e9cbebadfda1f739e325630 100644
--- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
+++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
@@ -52,6 +52,8 @@ class SYCLDeviceUSMSpace;  ///< Memory space on SYCL device, not accessible from
                            ///< the host
 class SYCLSharedUSMSpace;  ///< Memory space accessible from both the SYCL
                            ///< device and the host
+class SYCLHostUSMSpace;    ///< Memory space accessible from both the SYCL
+                           ///< device and the host (host pinned)
 class SYCL;                ///< Execution space for SYCL
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
index 7f72b3983f57c9adea157cf70d815339696cd986..5167c9ed65b42b1e567286849f37e89616e0e980 100644
--- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
+++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -1518,28 +1518,14 @@ struct Tile_Loop_Type<
 };
 // end Structs for calling loops
 
-template <typename T>
-using is_void_type = std::is_same<T, void>;
-
-template <typename T>
-struct is_type_array : std::false_type {
-  using value_type = T;
-};
-
-template <typename T>
-struct is_type_array<T[]> : std::true_type {
-  using value_type = T;
-};
-
 template <typename RP, typename Functor, typename Tag = void,
           typename ValueType = void, typename Enable = void>
 struct HostIterateTile;
 
 // For ParallelFor
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<is_void_type<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<std::is_void<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
@@ -1947,10 +1933,9 @@ struct HostIterateTile<
 // For ParallelReduce
 // ValueType - scalar: For reductions
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<!is_void_type<ValueType>::value &&
-                            !is_type_array<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<!std::is_void<ValueType>::value &&
+                                        !std::is_array<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
@@ -2370,17 +2355,16 @@ struct HostIterateTile<
 // Extra specialization for array reductions
 // ValueType[]: For array reductions
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<!is_void_type<ValueType>::value &&
-                            is_type_array<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<!std::is_void<ValueType>::value &&
+                                        std::is_array<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
   using value_type =
-      typename is_type_array<ValueType>::value_type;  // strip away the
-                                                      // 'array-ness' [], only
-                                                      // underlying type remains
+      std::remove_extent_t<ValueType>;  // strip away the
+                                        // 'array-ness' [], only
+                                        // underlying type remains
 
   inline HostIterateTile(
       RP const& rp, Functor const& func,
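The replacement of the bespoke `is_void_type` / `is_type_array` helpers with standard traits is behavior-preserving; a few compile-time checks make the equivalence concrete:

```cpp
#include <type_traits>

// std::is_void / std::is_array cover the removed detection helpers, and
// std::remove_extent_t recovers what is_type_array<T[]>::value_type did.
static_assert(std::is_void<void>::value, "void detected");
static_assert(std::is_array<double[]>::value, "unbounded array detected");
static_assert(std::is_same<std::remove_extent_t<double[]>, double>::value,
              "array-ness stripped; underlying type remains");
```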
diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
index c513817b5b8cbd74847e180099081bb475020c44..20fc6268c7dcd30c5c8deac332c61be673c7cc3d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -63,12 +63,39 @@
 namespace Kokkos {
 namespace Impl {
 
+//==============================================================================
+// <editor-fold desc="AnalyzePolicyBaseTraits"> {{{1
+
+// Mix in the defaults (base_traits) for the traits that aren't yet handled
+
 //------------------------------------------------------------------------------
+// <editor-fold desc="MSVC EBO failure workaround"> {{{2
+
+template <class TraitSpecList>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
+template <class... TraitSpecifications>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
+    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
+    : TraitSpecifications::base_traits... {};
+
+// </editor-fold> end AnalyzePolicyBaseTraits }}}1
+//==============================================================================
 
-using execution_policy_trait_specifications =
-    type_list<ExecutionSpaceTrait, GraphKernelTrait, IndexTypeTrait,
-              IterationPatternTrait, LaunchBoundsTrait, OccupancyControlTrait,
-              ScheduleTrait, WorkItemPropertyTrait, WorkTagTrait>;
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+//------------------------------------------------------------------------------
+// Note: unspecialized, so that the default pathway is to fall back to using
+// the PolicyTraitMatcher. See AnalyzeExecPolicyUseMatcher below
+template <class Enable, class... Traits>
+struct AnalyzeExecPolicy
+    : AnalyzeExecPolicyUseMatcher<void, execution_policy_trait_specifications,
+                                  Traits...> {
+  using base_t =
+      AnalyzeExecPolicyUseMatcher<void, execution_policy_trait_specifications,
+                                  Traits...>;
+  using base_t::base_t;
+};
 
 //------------------------------------------------------------------------------
 // Ignore void for backwards compatibility purposes, though hopefully no one is
@@ -81,15 +108,6 @@ struct AnalyzeExecPolicy<void, void, Traits...>
 };
 
 //------------------------------------------------------------------------------
-// Mix in the defaults (base_traits) for the traits that aren't yet handled
-
-template <class TraitSpecList>
-struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
-template <class... TraitSpecifications>
-struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
-    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
-    : TraitSpecifications::base_traits... {};
-
 template <>
 struct AnalyzeExecPolicy<void>
     : AnalyzeExecPolicyBaseTraits<execution_policy_trait_specifications> {
@@ -108,6 +126,68 @@ struct AnalyzeExecPolicy<void>
   }
 };
 
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicyUseMatcher"> {{{1
+
+// We can avoid having to have policies specialize AnalyzeExecPolicy themselves
+// by piggy-backing off of the PolicyTraitMatcher that we need to have for
+// things like require() anyway. We mixin the effects of the trait using
+// the `mixin_matching_trait` nested alias template in the trait specification
+
+// General PolicyTraitMatcher version
+
+// Matching case
+template <class TraitSpec, class... TraitSpecs, class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<
+    std::enable_if_t<PolicyTraitMatcher<TraitSpec, Trait>::value>,
+    type_list<TraitSpec, TraitSpecs...>, Trait, Traits...>
+    : TraitSpec::template mixin_matching_trait<
+          Trait, AnalyzeExecPolicy<void, Traits...>> {
+  using base_t = typename TraitSpec::template mixin_matching_trait<
+      Trait, AnalyzeExecPolicy<void, Traits...>>;
+  using base_t::base_t;
+};
+
+// Non-matching case
+template <class TraitSpec, class... TraitSpecs, class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<
+    std::enable_if_t<!PolicyTraitMatcher<TraitSpec, Trait>::value>,
+    type_list<TraitSpec, TraitSpecs...>, Trait, Traits...>
+    : AnalyzeExecPolicyUseMatcher<void, type_list<TraitSpecs...>, Trait,
+                                  Traits...> {
+  using base_t = AnalyzeExecPolicyUseMatcher<void, type_list<TraitSpecs...>,
+                                             Trait, Traits...>;
+  using base_t::base_t;
+};
+
+// No match found case:
+template <class>
+struct show_name_of_invalid_execution_policy_trait;
+template <class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<void, type_list<>, Trait, Traits...> {
+  static constexpr auto trigger_error_message =
+      show_name_of_invalid_execution_policy_trait<Trait>{};
+  static_assert(
+      /* always false: */ std::is_void<Trait>::value,
+      "Unknown execution policy trait. Search compiler output for "
+      "'show_name_of_invalid_execution_policy_trait' to see the type of the "
+      "invalid trait.");
+};
+
+// All traits matched case:
+template <>
+struct AnalyzeExecPolicyUseMatcher<void, type_list<>>
+    : AnalyzeExecPolicy<void> {
+  using base_t = AnalyzeExecPolicy<void>;
+  using base_t::base_t;
+};
+
+// </editor-fold> end AnalyzeExecPolicyUseMatcher }}}1
+//==============================================================================
+
 //------------------------------------------------------------------------------
 // Used for defaults that depend on other analysis results
 template <class AnalysisResults>
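The matcher recursion above peels one trait specification off the type_list per step, using SFINAE in the first template argument to choose between consuming the user-supplied trait and trying the next specification. A self-contained miniature of the same dispatch shape, with hypothetical names:

```cpp
#include <type_traits>

// Hypothetical trait specs; each exposes a matcher, like PolicyTraitMatcher.
struct ScheduleSpecDemo {
  template <class T> using matches = std::is_enum<T>;
};
struct IndexTypeSpecDemo {
  template <class T> using matches = std::is_integral<T>;
};

template <class...> struct TypeListDemo {};

template <class Enable, class SpecList, class Trait> struct FindSpec;

// Matching case: the head spec recognizes the trait.
template <class Spec, class... Rest, class Trait>
struct FindSpec<std::enable_if_t<Spec::template matches<Trait>::value>,
                TypeListDemo<Spec, Rest...>, Trait> {
  using type = Spec;
};

// Non-matching case: recurse with the remaining specs.
template <class Spec, class... Rest, class Trait>
struct FindSpec<std::enable_if_t<!Spec::template matches<Trait>::value>,
                TypeListDemo<Spec, Rest...>, Trait>
    : FindSpec<void, TypeListDemo<Rest...>, Trait> {};

static_assert(
    std::is_same<FindSpec<void,
                          TypeListDemo<ScheduleSpecDemo, IndexTypeSpecDemo>,
                          long>::type,
                 IndexTypeSpecDemo>::value,
    "long is not an enum, so the matcher falls through to the second spec");
```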
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
index dd571eb6d72e23bf0d028493fbeec42f645b382b..d481a8dc0f21efa675e0b181a8c6981e1f9afce6 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -51,10 +51,6 @@
     !defined(KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP)
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 #include <impl/Kokkos_Atomic_Memory_Order.hpp>
 #include <impl/Kokkos_Memory_Fence.hpp>
 
@@ -115,13 +111,9 @@ __inline__ __device__ T atomic_compare_exchange(
                             const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -134,11 +126,7 @@ __inline__ __device__ T atomic_compare_exchange(
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
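The `__activemask` / `__ballot_sync` loop that replaces the old compatibility macros is a warp-cooperative retry: the set of lanes active at entry is captured once, each lane retries its locked update until it succeeds, and the loop exits only when every entrant lane has voted done, so divergent lanes contending for the same lock cannot deadlock the warp. Its skeleton, with hypothetical lock helpers:

```cpp
// Hypothetical helpers standing in for the lock-array operations.
__device__ bool try_acquire_lock_sketch();
__device__ void critical_section_sketch();
__device__ void release_lock_sketch();

// Skeleton of the warp-cooperative retry pattern used above.
__device__ void warp_safe_update_sketch() {
  int done                 = 0;
  unsigned int mask        = __activemask();          // lanes active at entry
  unsigned int active      = __ballot_sync(mask, 1);  // bitmap of entrants
  unsigned int done_active = 0;
  while (active != done_active) {  // loop until every entrant has finished
    if (!done) {
      if (try_acquire_lock_sketch()) {
        critical_section_sketch();
        release_lock_sketch();
        done = 1;
      }
    }
    done_active = __ballot_sync(mask, done);  // re-vote completion status
  }
}
```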
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
index bbea3c99b8fcfbf5c25132c913f92b7337001806..4bb8b4fd52af0c8beaf8c4dfadddfa7be58c5c54 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
@@ -51,10 +51,6 @@
 #ifndef KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
index f2c1c756a910d26de0eb3765e0b90684e564d243..cd840983d8a3bd24cace6e411cabc940d44ddfe1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_EXCHANGE_HPP)
 #define KOKKOS_ATOMIC_EXCHANGE_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -122,13 +118,9 @@ atomic_exchange(volatile T* const dest,
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
 
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -141,11 +133,7 @@ atomic_exchange(volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
index 5c3f825ed100450bac57110829f64094b782011d..9a2b13debc70f24bf6adb34ddee13815458245b3 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_ADD_HPP)
 #define KOKKOS_ATOMIC_FETCH_ADD_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -148,13 +144,9 @@ atomic_fetch_add(volatile T* const dest,
                                          const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -169,11 +161,7 @@ atomic_fetch_add(volatile T* const dest,
       }
     }
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
index c3446ae6a3bda89fac094ab9693688c2be9f77a5..148ed974420ff88d2831a2ac98ac70a8ea5f4bf2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_SUB_HPP)
 #define KOKKOS_ATOMIC_FETCH_SUB_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -143,13 +139,9 @@ atomic_fetch_sub(volatile T* const dest,
                                          const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -162,11 +154,7 @@ atomic_fetch_sub(volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
index 28ac7a3bab9e748f9d315ca479f57db885ed75c4..f6bdbca729a335e4218ec3ac9108f0c3046eac05 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -47,10 +47,6 @@
 #define KOKKOS_ATOMIC_GENERIC_HPP
 #include <Kokkos_Macros.hpp>
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 // Combination operands to be used in an Compare and Exchange based atomic
 // operation
 namespace Kokkos {
@@ -301,12 +297,8 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
   // This is a way to (hopefully) avoid dead lock in a warp
   T return_val;
   int done                 = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask        = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active      = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -319,11 +311,7 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 #elif defined(__HIP_DEVICE_COMPILE__)
@@ -377,12 +365,8 @@ atomic_oper_fetch(const Oper& op, volatile T* const dest,
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
   int done                 = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask        = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active      = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -395,11 +379,7 @@ atomic_oper_fetch(const Oper& op, volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 #elif defined(__HIP_DEVICE_COMPILE__)
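
The generic paths above build an arbitrary read-modify-write out of the same loop; on hosts this reduces to a compare-and-swap loop. A sketch in std::atomic terms, with MaxOper as a hypothetical operand type (Kokkos' real operand structs follow the same shape):

    #include <atomic>

    struct MaxOper {
      template <class T>
      static T apply(T a, T b) { return a > b ? a : b; }
    };

    template <class Oper, class T>
    T atomic_fetch_oper_sketch(std::atomic<T>& dest, T val) {
      T oldval = dest.load();
      T newval;
      do {
        newval = Oper::apply(oldval, val);
        // On failure compare_exchange_weak reloads oldval, so the next
        // iteration recomputes the operation against the fresh value.
      } while (!dest.compare_exchange_weak(oldval, newval));
      return oldval;  // fetch_oper returns the previous value
    }

    // e.g. T prev = atomic_fetch_oper_sketch<MaxOper>(counter, T(42));
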
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
index 975318b7dde67a1d1569c3cf657060c3ae18215d..f763f8c7916e4875e6c1b2a6c3733c89c532f6bb 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -339,9 +339,8 @@ class AtomicDataElement {
   }
 
   KOKKOS_INLINE_FUNCTION
-  operator volatile non_const_value_type() volatile const {
-    // return Kokkos::atomic_load(ptr);
-    return *ptr;
+  operator non_const_value_type() volatile const {
+    return Kokkos::Impl::atomic_load(ptr);
   }
 };
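
The AtomicDataElement change replaces a plain volatile read with a real atomic load: a raw dereference of an element wider than the machine word can tear while another thread is mid-store. An illustration in std::atomic terms (not the Kokkos dispatch itself, which branches on sizeof(T)):

    #include <atomic>
    #include <cstdint>

    struct Pair128 {
      std::uint64_t lo, hi;
    };

    std::atomic<Pair128> element{};  // lock-based where no 128-bit CAS exists

    Pair128 read_consistent() {
      // Observes a complete store from another thread; a plain
      // `*reinterpret_cast<volatile Pair128*>(&element)` could return
      // half of an old value and half of a new one.
      return element.load();
    }
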
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
index 4e46b8d157f83129182d4db9b725bcddbe3ed28b..87f18604da52a62c6c8de22e8c670169ceec643a 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -55,7 +55,7 @@
 // To use OpenCL(TM) built-in intrinsics inside kernels, we have to
 // forward-declare their prototype, also see
 // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md
-#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \
     defined(__SYCL_DEVICE_ONLY__)
 extern SYCL_EXTERNAL unsigned long __attribute__((overloadable))
 intel_get_cycle_counter();
@@ -85,7 +85,7 @@ uint64_t clock_tic() noexcept {
 
   return clock64();
 
-#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \
     defined(__SYCL_DEVICE_ONLY__)
   return intel_get_cycle_counter();
 #elif defined(KOKKOS_ENABLE_OPENMPTARGET)
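
Only the architecture macro changes here (KOKKOS_ARCH_INTEL_GEN becomes KOKKOS_ARCH_INTEL_GPU); clock_tic() itself remains a low-overhead cycle stamp used for things like thread-"random" seeding and back-off. For orientation, the x86 host fallback amounts to the following sketch (assuming a GCC/Clang toolchain on an rdtsc-capable target):

    #include <cstdint>
    #include <x86intrin.h>  // __rdtsc on GCC/Clang

    inline std::uint64_t clock_tic_sketch() noexcept {
      return __rdtsc();  // time-stamp counter read, a few cycles of overhead
    }
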
diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
index 06681a95ae902c613c701cd78ff572d35da6c0a1..4ec8513191f21e07896bac21274e3af088dfe518 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
@@ -76,17 +76,17 @@ struct CombinedReducerValueItemImpl {
       CombinedReducerValueItemImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl(
       CombinedReducerValueItemImpl&&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
-  operator=(CombinedReducerValueItemImpl const&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
-  operator=(CombinedReducerValueItemImpl&&) = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=(
+      CombinedReducerValueItemImpl const&) = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=(
+      CombinedReducerValueItemImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReducerValueItemImpl() = default;
   explicit KOKKOS_FUNCTION CombinedReducerValueItemImpl(value_type arg_value)
       : m_value(std::move(arg_value)) {}
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 value_type& ref() & noexcept { return m_value; }
+  constexpr value_type& ref() & noexcept { return m_value; }
   KOKKOS_FORCEINLINE_FUNCTION
   constexpr value_type const& ref() const& noexcept { return m_value; }
   KOKKOS_FORCEINLINE_FUNCTION
@@ -112,11 +112,11 @@ struct CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
   KOKKOS_DEFAULTED_FUNCTION
   constexpr CombinedReducerValueImpl(CombinedReducerValueImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
+  constexpr CombinedReducerValueImpl& operator=(
       CombinedReducerValueImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
-      CombinedReducerValueImpl&&) = default;
+  constexpr CombinedReducerValueImpl& operator=(CombinedReducerValueImpl&&) =
+      default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReducerValueImpl() = default;
 
@@ -165,20 +165,19 @@ struct CombinedReducerStorageImpl {
   // model Reducer
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _init(value_type& val) const {
+  constexpr _fold_comma_emulation_return _init(value_type& val) const {
     m_reducer.init(val);
     return _fold_comma_emulation_return{};
   }
 
-  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _join(value_type& dest, value_type const& src) const {
+  KOKKOS_INLINE_FUNCTION constexpr _fold_comma_emulation_return _join(
+      value_type& dest, value_type const& src) const {
     m_reducer.join(dest, src);
     return _fold_comma_emulation_return{};
   }
 
-  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _join(value_type volatile& dest, value_type const volatile& src) const {
+  KOKKOS_INLINE_FUNCTION constexpr _fold_comma_emulation_return _join(
+      value_type volatile& dest, value_type const volatile& src) const {
     m_reducer.join(dest, src);
     return _fold_comma_emulation_return{};
   }
@@ -242,10 +241,10 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
       CombinedReducerImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
-      CombinedReducerImpl&&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
+      CombinedReducerImpl&&)                                       = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=(
       CombinedReducerImpl const&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=(
       CombinedReducerImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION ~CombinedReducerImpl() = default;
@@ -257,9 +256,8 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
                                                        reducers)...,
         m_value_view(&value) {}
 
-  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void join(value_type& dest,
-                                                value_type const& src) const
-      noexcept {
+  KOKKOS_FUNCTION constexpr void join(value_type& dest,
+                                      value_type const& src) const noexcept {
     emulate_fold_comma_operator(
         this->CombinedReducerStorageImpl<Idxs, Reducers>::_join(
             dest.template get<Idxs, typename Reducers::value_type>(),
@@ -274,8 +272,7 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
             src.template get<Idxs, typename Reducers::value_type>())...);
   }
 
-  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void init(value_type& dest) const
-      noexcept {
+  KOKKOS_FUNCTION constexpr void init(value_type& dest) const noexcept {
     emulate_fold_comma_operator(
         this->CombinedReducerStorageImpl<Idxs, Reducers>::_init(
             dest.template get<Idxs, typename Reducers::value_type>())...);
@@ -298,7 +295,7 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
   }
 
   KOKKOS_FUNCTION
-  KOKKOS_CONSTEXPR_14 static void write_value_back_to_original_references(
+  constexpr static void write_value_back_to_original_references(
       value_type const& value,
       Reducers const&... reducers_that_reference_original_values) noexcept {
     emulate_fold_comma_operator(
@@ -360,10 +357,10 @@ struct CombinedReductionFunctorWrapperImpl<
   constexpr CombinedReductionFunctorWrapperImpl(
       CombinedReductionFunctorWrapperImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=(
+  constexpr CombinedReductionFunctorWrapperImpl& operator=(
       CombinedReductionFunctorWrapperImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=(
+  constexpr CombinedReductionFunctorWrapperImpl& operator=(
       CombinedReductionFunctorWrapperImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReductionFunctorWrapperImpl() = default;
@@ -551,7 +548,7 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
                      ReturnType2&& returnType2,
                      ReturnTypes&&... returnTypes) noexcept ->
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type {
+        Kokkos::is_execution_policy<PolicyType>::value>::type {
   //----------------------------------------
   // Since we don't support asynchronous combined reducers yet for various
   // reasons, we actually just want to work with the pointers and references
@@ -581,8 +578,11 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
 
   reduce_adaptor_t::execute(label, policy, combined_functor, combined_reducer);
   Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            combined_reducer_type>::fence(policy.space(),
-                                                          combined_reducer);
+                            combined_reducer_type>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          combined_reducer);
   combined_reducer.write_value_back_to_original_references(
       value, Impl::_make_reducer_from_arg<space_type>(returnType1),
       Impl::_make_reducer_from_arg<space_type>(returnType2),
@@ -596,7 +596,7 @@ auto parallel_reduce(PolicyType const& policy, Functor const& functor,
                      ReturnType1&& returnType1, ReturnType2&& returnType2,
                      ReturnTypes&&... returnTypes) noexcept ->
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type {
+        Kokkos::is_execution_policy<PolicyType>::value>::type {
   //----------------------------------------
   Kokkos::parallel_reduce("", policy, functor,
                           std::forward<ReturnType1>(returnType1),
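
Three things happen in this file: KOKKOS_CONSTEXPR_14 is dropped in favor of plain constexpr (the C++14 baseline makes the macro pointless), the is_execution_policy trait is taken from the public Kokkos:: namespace instead of Impl::, and the fence forced by a scalar reduction result gets a descriptive name. For context, the entry point being touched is the multi-result parallel_reduce these combined reducers enable; a usage sketch, assuming an initialized Kokkos runtime:

    #include <Kokkos_Core.hpp>

    void min_and_sum(int n, double& min_out, double& sum_out) {
      Kokkos::parallel_reduce(
          "min_and_sum", Kokkos::RangePolicy<>(0, n),
          KOKKOS_LAMBDA(const int i, double& lmin, double& lsum) {
            const double x = static_cast<double>(i);
            if (x < lmin) lmin = x;
            lsum += x;  // plain scalar result: reduced as a sum
          },
          Kokkos::Min<double>(min_out), sum_out);
      // Scalar (value, not view) results require the named fence above
      // before min_out and sum_out may be read.
    }
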
diff --git a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
index c02f4acddacb41f5fb01c50536f6a426738fac99..dafe57f8da71cd22ea09a4e93a84f3196b24ca5c 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
@@ -138,15 +138,15 @@ struct concurrent_bitset {
     // when is full at the atomic_fetch_add(+1)
     // then a release occurs before the atomic_fetch_add(-1).
 
-    const uint32_t state =
-        (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1);
+    const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add(
+        reinterpret_cast<volatile int *>(buffer), 1);
 
     const uint32_t state_error = state_header != (state & state_header_mask);
 
     const uint32_t state_bit_used = state & state_used_mask;
 
     if (state_error || (bit_bound <= state_bit_used)) {
-      Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+      Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
       return state_error ? type(-2, -2) : type(-1, -1);
     }
 
@@ -222,15 +222,15 @@ struct concurrent_bitset {
     // when is full at the atomic_fetch_add(+1)
     // then a release occurs before the atomic_fetch_add(-1).
 
-    const uint32_t state =
-        (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1);
+    const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add(
+        reinterpret_cast<volatile int *>(buffer), 1);
 
     const uint32_t state_error = state_header != (state & state_header_mask);
 
     const uint32_t state_bit_used = state & state_used_mask;
 
     if (state_error || (bit_bound <= state_bit_used)) {
-      Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+      Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
       return state_error ? type(-2, -2) : type(-1, -1);
     }
 
@@ -300,7 +300,8 @@ struct concurrent_bitset {
     // Do not update count until bit clear is visible
     Kokkos::memory_fence();
 
-    const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+    const int count =
+        Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
 
     // Flush the store-release
     Kokkos::memory_fence();
@@ -336,7 +337,8 @@ struct concurrent_bitset {
     // Do not update count until bit clear is visible
     Kokkos::memory_fence();
 
-    const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+    const int count =
+        Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
 
     return (count & state_used_mask) - 1;
   }
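
Mechanical change: the C-style pointer casts become reinterpret_cast. The generated code is identical, but the cast now states exactly one conversion and cannot silently degrade into a const_cast or static_cast if the surrounding types change. In isolation:

    #include <cstdint>

    void sketch(std::uint32_t* buffer) {
      // Before: (volatile int *)buffer — performs whichever conversion
      // happens to compile. After: only reinterpretation is permitted,
      // so e.g. a `const uint32_t*` source would now fail to compile
      // instead of silently casting the const away.
      volatile int* counter = reinterpret_cast<volatile int*>(buffer);
      (void)counter;
    }
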
diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
index b4769fbeaa53be8353df315ede634708da1b297d..a1f9d336329fff0426863f28f493854e7f8091f3 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -130,6 +130,11 @@ void ExecSpaceManager::static_fence() {
     to_fence.second->fence();
   }
 }
+void ExecSpaceManager::static_fence(const std::string& name) {
+  for (auto& to_fence : exec_space_factory_list) {
+    to_fence.second->fence(name);
+  }
+}
 void ExecSpaceManager::print_configuration(std::ostream& msg,
                                            const bool detail) {
   for (auto& to_print : exec_space_factory_list) {
@@ -506,11 +511,6 @@ void pre_initialize_internal(const InitArguments& args) {
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes");
 #else
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no");
-#endif
-#ifdef KOKKOS_ENABLE_MPI
-  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "yes");
-#else
-  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "no");
 #endif
   declare_configuration_metadata("architecture", "Default Device",
                                  typeid(Kokkos::DefaultExecutionSpace).name());
@@ -564,7 +564,9 @@ void finalize_internal(const bool all_spaces = false) {
   g_tune_internals = false;
 }
 
-void fence_internal() { Impl::ExecSpaceManager::get_instance().static_fence(); }
+void fence_internal(const std::string& name) {
+  Impl::ExecSpaceManager::get_instance().static_fence(name);
+}
 
 bool check_arg(char const* arg, char const* expected) {
   std::size_t arg_len = std::strlen(arg);
@@ -1092,7 +1094,8 @@ void finalize_all() {
   Impl::finalize_internal(all_spaces);
 }
 
-void fence() { Impl::fence_internal(); }
+void fence() { Impl::fence_internal("Kokkos::fence: Unnamed Global Fence"); }
+void fence(const std::string& name) { Impl::fence_internal(name); }
 
 void print_helper(std::ostringstream& out,
                   const std::map<std::string, std::string>& print_me) {
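
The fence machinery gains a label that is threaded down through static_fence() to every registered execution space, so profiling tools can attribute stalls; the unnamed overload keeps working by forwarding a default string. Call-site sketch:

    #include <Kokkos_Core.hpp>

    void drain_device() {
      // The label shows up in kokkosp fence callbacks instead of the
      // generic "Kokkos::fence: Unnamed Global Fence".
      Kokkos::fence("drain_device: wait for outstanding kernels");
    }
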
diff --git a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
index a124511c07e2fcb50c8392e56f7c5393262a3af7..dc8e5e4d830623b4fa794966da4b724467e181dc 100644
--- a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
@@ -79,20 +79,6 @@ struct EBOBaseImpl;
 
 template <class T, template <class...> class CtorNotOnDevice>
 struct EBOBaseImpl<T, true, CtorNotOnDevice> {
-  /*
-   * Workaround for constexpr in C++11: we need to still call T(args...), but we
-   * can't do so in the body of a constexpr function (in C++11), and there's no
-   * data member to construct into. But we can construct into an argument
-   * of a delegating constructor...
-   */
-  // TODO @minor DSH the destructor gets called too early with this workaround
-  struct _constexpr_14_workaround_tag {};
-  struct _constexpr_14_workaround_no_device_tag {};
-  KOKKOS_FORCEINLINE_FUNCTION
-  constexpr EBOBaseImpl(_constexpr_14_workaround_tag, T&&) noexcept {}
-  inline constexpr EBOBaseImpl(_constexpr_14_workaround_no_device_tag,
-                               T&&) noexcept {}
-
   template <
       class... Args, class _ignored = void,
       typename std::enable_if<std::is_void<_ignored>::value &&
@@ -100,10 +86,7 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
                                   !CtorNotOnDevice<Args...>::value,
                               int>::type = 0>
   KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl(
-      Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...)))
-      // still call the constructor
-      : EBOBaseImpl(_constexpr_14_workaround_tag{},
-                    T(std::forward<Args>(args)...)) {}
+      Args&&...) noexcept {}
 
   template <
       class... Args, class _ignored = void,
@@ -111,11 +94,7 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
                                   std::is_constructible<T, Args...>::value &&
                                   CtorNotOnDevice<Args...>::value,
                               long>::type = 0>
-  inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept(
-      noexcept(T(std::forward<Args>(args)...)))
-      // still call the constructor
-      : EBOBaseImpl(_constexpr_14_workaround_no_device_tag{},
-                    T(std::forward<Args>(args)...)) {}
+  inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {}
 
   KOKKOS_DEFAULTED_FUNCTION
   constexpr EBOBaseImpl(EBOBaseImpl const&) = default;
@@ -124,19 +103,16 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
   constexpr EBOBaseImpl(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
   ~EBOBaseImpl() = default;
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T& _ebo_data_member() & { return *reinterpret_cast<T*>(this); }
+  constexpr T& _ebo_data_member() & { return *reinterpret_cast<T*>(this); }
 
   KOKKOS_INLINE_FUNCTION
   constexpr T const& _ebo_data_member() const& {
@@ -154,8 +130,9 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
   }
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T&& _ebo_data_member() && { return std::move(*reinterpret_cast<T*>(this)); }
+  constexpr T&& _ebo_data_member() && {
+    return std::move(*reinterpret_cast<T*>(this));
+  }
 };
 
 template <class T, template <class...> class CTorsNotOnDevice>
@@ -191,12 +168,10 @@ struct EBOBaseImpl<T, false, CTorsNotOnDevice> {
   constexpr EBOBaseImpl(EBOBaseImpl&&) noexcept = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
   ~EBOBaseImpl() = default;
@@ -232,8 +207,7 @@ struct StandardLayoutNoUniqueAddressMemberEmulation
   using ebo_base_t::ebo_base_t;
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T& no_unique_address_data_member() & {
+  constexpr T& no_unique_address_data_member() & {
     return this->ebo_base_t::_ebo_data_member();
   }
 
@@ -253,8 +227,7 @@ struct StandardLayoutNoUniqueAddressMemberEmulation
   }
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T&& no_unique_address_data_member() && {
+  constexpr T&& no_unique_address_data_member() && {
     return this->ebo_base_t::_ebo_data_member();
   }
 };
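
With the C++11 constexpr workaround removed, EBOBaseImpl is back to a plain empty-base-optimization wrapper: it emulates [[no_unique_address]] for pre-C++20 compilers. The size saving it exists for, in miniature:

    #include <cstddef>

    struct Empty {};  // a stateless deleter/comparator/functor

    struct AsMember {
      Empty e;  // a member subobject must occupy at least one byte
      void* p;
    };

    struct AsBase : Empty {  // empty base may share its address with p
      void* p;
    };

    static_assert(sizeof(AsMember) > sizeof(void*), "member costs padding");
    static_assert(sizeof(AsBase) == sizeof(void*),
                  "empty base folded away (guaranteed for standard-layout)");
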
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
index dfb9f3a51cdbd9aa7e189e21f5956806d53823b5..9c8024cbd03ee9230b1ed27468c7cb82aadc5d97 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -138,6 +138,9 @@ void Experimental::RawMemoryAllocationFailure::print_error_message(
     case AllocationMechanism::SYCLMallocShared:
       o << "sycl::malloc_shared().";
       break;
+    case AllocationMechanism::SYCLMallocHost:
+      o << "sycl::malloc_host().";
+      break;
   }
   append_additional_error_information(o);
   o << ")" << std::endl;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
index 5db459734631ddff5d0a29963a9ec04b9ec549ea..dc9bfe2b5a9e0eb66dc2a6ae43fd296726e7a458 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -97,7 +97,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
     HIPMalloc,
     HIPHostMalloc,
     SYCLMallocDevice,
-    SYCLMallocShared
+    SYCLMallocShared,
+    SYCLMallocHost
   };
 
  private:
@@ -218,31 +219,41 @@ KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort(
 
 #if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \
     defined(KOKKOS_ENABLE_DEBUG)
-#define KOKKOS_EXPECTS(...)                                               \
-  {                                                                       \
-    if (!bool(__VA_ARGS__)) {                                             \
-      ::Kokkos::abort(                                                    \
-          "Kokkos contract violation:\n  "                                \
-          "  Expected precondition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                     \
+#define KOKKOS_EXPECTS(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Expected precondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
-#define KOKKOS_ENSURES(...)                                               \
-  {                                                                       \
-    if (!bool(__VA_ARGS__)) {                                             \
-      ::Kokkos::abort(                                                    \
-          "Kokkos contract violation:\n  "                                \
-          "  Ensured postcondition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                     \
+#define KOKKOS_ENSURES(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Ensured postcondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
-// some projects already define this for themselves, so don't mess them up
+// some projects already define this for themselves, so don't mess
+// them up
 #ifndef KOKKOS_ASSERT
-#define KOKKOS_ASSERT(...)                                             \
-  {                                                                    \
-    if (!bool(__VA_ARGS__)) {                                          \
-      ::Kokkos::abort(                                                 \
-          "Kokkos contract violation:\n  "                             \
-          "  Asserted condition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                  \
+#define KOKKOS_ASSERT(...)                                                     \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Asserted condition `" #__VA_ARGS__                                \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
 #endif  // ifndef KOKKOS_ASSERT
 #else   // not debug mode
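
The contract macros now append the failing location. The "Error at file:line" suffix is built from the classic two-level stringification idiom (KOKKOS_IMPL_TOSTRING is Kokkos' spelling of it); reduced to its essentials:

    #include <cstdio>

    #define SKETCH_STRINGIZE(x) #x
    #define SKETCH_TOSTRING(x) SKETCH_STRINGIZE(x)  // expand x, then stringize

    int main() {
      // The indirection matters: SKETCH_STRINGIZE(__LINE__) would print
      // the literal text "__LINE__" rather than the line number.
      std::puts("Error at " __FILE__ ":" SKETCH_TOSTRING(__LINE__));
    }
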
diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
index a922e7e3f9b19d0413487674f847b539e9d4f10a..1a0b10e40fe5e280746c3c0443202a4413585a0c 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
@@ -55,6 +55,7 @@ class ExecSpaceInitializerBase {
   virtual void initialize(const InitArguments &args)                     = 0;
   virtual void finalize(const bool all_spaces)                           = 0;
   virtual void fence()                                                   = 0;
+  virtual void fence(const std::string &)                                = 0;
   virtual void print_configuration(std::ostream &msg, const bool detail) = 0;
   ExecSpaceInitializerBase()          = default;
   virtual ~ExecSpaceInitializerBase() = default;
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
index 22e88ebc4fc57d4e7132bca0be2aa55f5bfc5f69..5de92fc45741234aafaa97fb0c31dc11aa9d9c10 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -48,7 +48,6 @@
 #include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -1335,7 +1334,10 @@ struct FunctorValueTraits<FunctorType, ArgTag,
   using functor_type = FunctorType;
 
   static_assert(
-      IS_VOID || IS_REJECT || 0 == (sizeof(ValueType) % sizeof(int)),
+      IS_VOID || IS_REJECT ||
+          ((sizeof(ValueType) > sizeof(int))
+               ? 0 == sizeof(ValueType) % sizeof(int)
+               : true),
       "Reduction functor's value_type deduced from functor::operator() "
       "requires: 0 == sizeof(value_type) % sizeof(int)");
 
@@ -1902,17 +1904,6 @@ struct FunctorFinalFunction {
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type&));
 
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type volatile & ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & )
-  // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type volatile & ) ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (             *)( ArgTag         , value_type volatile & )
-  // ); KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)(
-  // ArgTag const & , value_type volatile & ) );
-
   KOKKOS_INLINE_FUNCTION static void enable_if(
       void (FunctorType::*)(ArgTag, value_type const&) const);
   KOKKOS_INLINE_FUNCTION static void enable_if(
@@ -1925,17 +1916,6 @@ struct FunctorFinalFunction {
                                                         value_type const&));
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type const&));
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type const volatile & ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type const
-  // volatile & ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type const volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type const volatile & ) ); KOKKOS_INLINE_FUNCTION static
-  // void enable_if( void (             *)( ArgTag         , value_type const
-  // volatile & ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)(
-  // ArgTag const & , value_type const volatile & ) );
 };
 
 // Compatible functions for 'final' function and value_type is an array
@@ -1956,17 +1936,6 @@ struct FunctorFinalFunction<FunctorType, ArgTag, true> {
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type*));
 
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type volatile * ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * )
-  // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type volatile * ) ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (             *)( ArgTag         , value_type volatile * )
-  // ); KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)(
-  // ArgTag const & , value_type volatile * ) );
-
   KOKKOS_INLINE_FUNCTION static void enable_if(
       void (FunctorType::*)(ArgTag, value_type const*) const);
   KOKKOS_INLINE_FUNCTION static void enable_if(
@@ -1979,17 +1948,6 @@ struct FunctorFinalFunction<FunctorType, ArgTag, true> {
                                                         value_type const*));
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type const*));
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type const volatile * ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type const
-  // volatile * ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type const volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type const volatile * ) ); KOKKOS_INLINE_FUNCTION static
-  // void enable_if( void (             *)( ArgTag         , value_type const
-  // volatile * ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)(
-  // ArgTag const & , value_type const volatile * ) );
 };
 
 template <class FunctorType>
@@ -2109,89 +2067,4 @@ struct FunctorFinal<
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class ArgTag,
-          class ReferenceType =
-              typename FunctorValueTraits<FunctorType, ArgTag>::reference_type>
-struct FunctorApplyFunction {
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        ReferenceType));
-};
-
-template <class FunctorType, class ReferenceType>
-struct FunctorApplyFunction<FunctorType, void, ReferenceType> {
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ReferenceType));
-};
-
-template <class FunctorType>
-struct FunctorApplyFunction<FunctorType, void, void> {
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)() const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)());
-};
-
-template <class FunctorType, class ArgTag, class ReferenceType,
-          class Enable = void>
-struct FunctorApply {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType&, void*) {}
-};
-
-/* 'apply' function provided for void value */
-template <class FunctorType, class ArgTag>
-struct FunctorApply<
-    FunctorType, ArgTag,
-    void
-    // First  substitution failure when FunctorType::apply does not exist.
-    // Second substitution failure when enable_if( & Functor::apply ) does not
-    // exist
-    ,
-    decltype(FunctorApplyFunction<FunctorType, ArgTag, void>::enable_if(
-        &FunctorType::apply))> {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f) { f.apply(); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f) {
-    f.apply();
-  }
-};
-
-/* 'apply' function provided for single value */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorApply<FunctorType, ArgTag,
-                    T&
-                    // First  substitution failure when FunctorType::apply does
-                    // not exist. Second substitution failure when enable_if( &
-                    // Functor::apply ) does not exist
-                    ,
-                    decltype(
-                        FunctorApplyFunction<FunctorType, ArgTag>::enable_if(
-                            &FunctorType::apply))> {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f, void* p) {
-    f.apply(*((T*)p));
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f, void* p) {
-    f.apply(*((T*)p));
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* KOKKOS_FUNCTORADAPTER_HPP */
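
Two things happen in this file: the long-dead commented-out volatile overloads and the unused FunctorApply machinery are deleted outright, and the value-size static_assert is relaxed so reduction value types smaller than int (char, short, bool) are accepted, while larger types must still be a multiple of sizeof(int). The relaxed condition in isolation (note the diagnostic string in the hunk still quotes the old unconditional rule):

    template <class ValueType>
    constexpr bool reduction_size_ok() {
      return sizeof(ValueType) > sizeof(int)
                 ? sizeof(ValueType) % sizeof(int) == 0
                 : true;  // sub-int types now pass
    }

    static_assert(reduction_size_ok<char>(), "previously rejected");
    static_assert(reduction_size_ok<double>(), "still must be int-multiple");
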
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
index a56d19ee722668389b4b43bc377d79bb7fd9799b..7140154e0f6f276dc928dc5f3a73cda97f6e2cec 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -48,7 +48,6 @@
 #include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -722,14 +721,16 @@ struct FunctorAnalysis {
 
     template <bool IsArray>
     KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<IsArray, FunctorAnalysis::ValueType*>::type
+        typename std::enable_if<IsArray,
+                                typename FunctorAnalysis::ValueType*>::type
         ref() const noexcept {
       return m_result;
     }
 
     template <bool IsArray>
     KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<!IsArray, FunctorAnalysis::ValueType&>::type
+        typename std::enable_if<!IsArray,
+                                typename FunctorAnalysis::ValueType&>::type
         ref() const noexcept {
       return *m_result;
     }
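
The added typename keywords mark FunctorAnalysis::ValueType as a type in a context where some front ends treat the qualified name as dependent. The general rule, reduced:

    template <class T>
    struct Wrap {
      using type = T;
    };

    template <class T>
    typename Wrap<T>::type identity(typename Wrap<T>::type v) {
      // Without `typename`, Wrap<T>::type in a template is assumed to
      // name a non-type member and the declaration is ill-formed.
      return v;
    }
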
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
index 97286dd07f4ea2ee94f3070768f425e2ef5b7896..3b7b194db58cb693f69d8a6560896565062b9d99 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
@@ -47,6 +47,7 @@
 
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
 
 #include <functional>
 
@@ -92,6 +93,8 @@ class HostSharedPtr {
     // FIXME_OPENMPTARGET requires something like KOKKOS_IMPL_IF_ON_HOST
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#else
+    m_control = nullptr;
 #endif
   }
 
@@ -115,6 +118,8 @@ class HostSharedPtr {
       // FIXME_OPENMPTARGET
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
       if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#else
+      m_control = nullptr;
 #endif
     }
     return *this;
@@ -154,6 +159,9 @@ class HostSharedPtr {
     // object pointed to by m_counter and m_element_ptr.
     if (m_control) {
       int const count = Kokkos::atomic_fetch_sub(&(m_control->m_counter), 1);
+      // atomic_fetch_sub might have memory order relaxed so we need to force
+      // synchronization to avoid multiple threads doing the cleanup.
+      Kokkos::memory_fence();
       if (count == 1) {
         (m_control->m_deleter)(m_element_ptr);
         m_element_ptr = nullptr;
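
The HostSharedPtr fix inserts a fence because Kokkos::atomic_fetch_sub takes no ordering argument and may be relaxed: without it, the thread that sees count == 1 could run the deleter before another thread's final writes to the object become visible. In std::atomic terms the same guarantee is usually spelled with an ordered decrement; sketch:

    #include <atomic>

    struct Control {
      std::atomic<int> m_counter{1};
    };

    template <class T>
    void release(Control* control, T* element) {
      // acq_rel: release publishes this thread's writes to *element;
      // acquire makes the last owner see everyone else's.
      if (control->m_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
        delete element;
        delete control;
      }
    }
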
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
index 2e5587e4a342c8c2b167f307f8c8b3a3215f304a..a7f4a652befb148bafff866d8949edb9f2520eaa 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -74,8 +74,8 @@ void HostThreadTeamData::organize_pool(HostThreadTeamData *members[],
     }
 
     {
-      HostThreadTeamData **const pool =
-          (HostThreadTeamData **)(root_scratch + m_pool_members);
+      HostThreadTeamData **const pool = reinterpret_cast<HostThreadTeamData **>(
+          root_scratch + m_pool_members);
 
       // team size == 1, league size == pool_size
 
@@ -136,7 +136,8 @@ int HostThreadTeamData::organize_team(const int team_size) {
     if (team_size == 1) return 1;  // Already organized in teams of one
 
     HostThreadTeamData *const *const pool =
-        (HostThreadTeamData **)(m_pool_scratch + m_pool_members);
+        reinterpret_cast<HostThreadTeamData **>(m_pool_scratch +
+                                                m_pool_members);
 
     // "league_size" in this context is the number of concurrent teams
     // that the pool can accommodate.  Excess threads are idle.
@@ -239,7 +240,8 @@ int HostThreadTeamData::get_work_stealing() noexcept {
 
     if (w.first == -1 && m_steal_rank != m_pool_rank) {
       HostThreadTeamData *const *const pool =
-          (HostThreadTeamData **)(m_pool_scratch + m_pool_members);
+          reinterpret_cast<HostThreadTeamData **>(m_pool_scratch +
+                                                  m_pool_members);
 
       // Attempt from beginning failed, try to steal from end of neighbor
 
@@ -287,23 +289,17 @@ int HostThreadTeamData::get_work_stealing() noexcept {
 
     if (1 < m_team_size) {
       // Must share the work index
-      *((int volatile *)team_reduce()) = w.first;
+      *reinterpret_cast<int volatile *>(team_reduce()) = w.first;
 
       team_rendezvous_release();
     }
   } else if (1 < m_team_size) {
-    w.first = *((int volatile *)team_reduce());
+    w.first = *reinterpret_cast<int volatile *>(team_reduce());
   }
 
   // May exit because successfully stole work and w is good.
   // May exit because no work left to steal and w = (-1,-1).
 
-#if 0
-fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
-       , m_pool_rank , m_pool_size , w.first );
-fflush(stdout);
-#endif
-
   return w.first;
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
index d4cae7f122ed182cf88522d5d60729a0906cce5b..0652b55bb71cfb3923374e774bbb2db2f58ee90d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -91,9 +91,18 @@ class HostThreadTeamData {
   //   [ thread_local ]     = [ m_thread_local    .. m_scratch_size )
 
   enum : int { m_pool_members = 0 };
-  enum : int { m_pool_rendezvous = m_pool_members + max_pool_members };
-  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
-  enum : int { m_pool_reduce = m_team_rendezvous + max_team_rendezvous };
+  enum : int {
+    m_pool_rendezvous =
+        static_cast<int>(m_pool_members) + static_cast<int>(max_pool_members)
+  };
+  enum : int {
+    m_team_rendezvous = static_cast<int>(m_pool_rendezvous) +
+                        static_cast<int>(max_pool_rendezvous)
+  };
+  enum : int {
+    m_pool_reduce = static_cast<int>(m_team_rendezvous) +
+                    static_cast<int>(max_team_rendezvous)
+  };
 
   using pair_int_t = Kokkos::pair<int64_t, int64_t>;
 
@@ -120,13 +129,13 @@ class HostThreadTeamData {
   int mutable m_team_rendezvous_step;
 
   HostThreadTeamData* team_member(int r) const noexcept {
-    return ((HostThreadTeamData**)(m_pool_scratch +
-                                   m_pool_members))[m_team_base + r];
+    return (reinterpret_cast<HostThreadTeamData**>(
+        m_pool_scratch + m_pool_members))[m_team_base + r];
   }
 
  public:
   inline bool team_rendezvous() const noexcept {
-    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous);
     HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
     if (m_team_rank != 0) {
       HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
@@ -138,7 +147,7 @@ class HostThreadTeamData {
   }
 
   inline bool team_rendezvous(const int source_team_rank) const noexcept {
-    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous);
     HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
     if (m_team_rank != source_team_rank) {
       HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
@@ -150,12 +159,13 @@ class HostThreadTeamData {
   }
 
   inline void team_rendezvous_release() const noexcept {
-    HostBarrier::split_release((int*)(m_team_scratch + m_team_rendezvous),
-                               m_team_size, m_team_rendezvous_step);
+    HostBarrier::split_release(
+        reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous), m_team_size,
+        m_team_rendezvous_step);
   }
 
   inline int pool_rendezvous() const noexcept {
-    int* ptr = (int*)(m_pool_scratch + m_pool_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_pool_scratch + m_pool_rendezvous);
     HostBarrier::split_arrive(ptr, m_pool_size, m_pool_rendezvous_step);
     if (m_pool_rank != 0) {
       HostBarrier::wait(ptr, m_pool_size, m_pool_rendezvous_step);
@@ -167,8 +177,9 @@ class HostThreadTeamData {
   }
 
   inline void pool_rendezvous_release() const noexcept {
-    HostBarrier::split_release((int*)(m_pool_scratch + m_pool_rendezvous),
-                               m_pool_size, m_pool_rendezvous_step);
+    HostBarrier::split_release(
+        reinterpret_cast<int*>(m_pool_scratch + m_pool_rendezvous), m_pool_size,
+        m_pool_rendezvous_step);
   }
 
   //----------------------------------------
@@ -230,7 +241,8 @@ class HostThreadTeamData {
   constexpr int pool_size() const { return m_pool_size; }
 
   HostThreadTeamData* pool_member(int r) const noexcept {
-    return ((HostThreadTeamData**)(m_pool_scratch + m_pool_members))[r];
+    return (reinterpret_cast<HostThreadTeamData**>(m_pool_scratch +
+                                                   m_pool_members))[r];
   }
 
   //----------------------------------------
@@ -330,24 +342,11 @@ class HostThreadTeamData {
     team_shared_size = align_to_int64(team_shared_size);
     // thread_local_size = align_to_int64( thread_local_size );
 
-    m_scratch      = (int64_t*)alloc_ptr;
+    m_scratch      = static_cast<int64_t*>(alloc_ptr);
     m_team_reduce  = m_pool_reduce + pool_reduce_size;
     m_team_shared  = m_team_reduce + team_reduce_size;
     m_thread_local = m_team_shared + team_shared_size;
     m_scratch_size = align_to_int64(alloc_size);
-
-#if 0
-fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
-       , int(m_pool_members)
-       , int(m_pool_rendezvous)
-       , int(m_pool_reduce)
-       , int(m_team_reduce)
-       , int(m_team_shared)
-       , int(m_thread_local)
-       , int(m_scratch_size)
-       );
-fflush(stdout);
-#endif
   }
 
   //----------------------------------------
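
The enum rewrite looks noisy but is a conformance fix: arithmetic between two different unscoped enumeration types is deprecated in C++20, so each enumerator is converted to int before the scratch-area offsets are added. Reduced, with illustrative sizes:

    enum : int { pool_members = 0 };
    enum : int { max_pool_members = 1024 };  // illustrative value

    // Deprecated in C++20 (operands have different enumeration types):
    //   enum : int { pool_rendezvous = pool_members + max_pool_members };
    enum : int {
      pool_rendezvous =
          static_cast<int>(pool_members) + static_cast<int>(max_pool_members)
    };
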
diff --git a/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
index 79aeca5da0691264c4cb215f62e17bdb8dbe95e1..1ed502db5be61a2501b57f08969ff47ce12bdbe5 100644
--- a/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
@@ -110,7 +110,7 @@ struct SimpleSinglyLinkedListNode {
   friend struct LinkedListNodeAccess;
 
  public:
-  // KOKKOS_CONSTEXPR_14
+  // constexpr
   KOKKOS_INLINE_FUNCTION
   bool is_enqueued() const noexcept {
     // TODO @tasking @memory_order DSH make this an atomic load with memory
@@ -118,7 +118,7 @@ struct SimpleSinglyLinkedListNode {
     return m_next != reinterpret_cast<pointer_type>(NotEnqueuedValue);
   }
 
-  // KOKKOS_CONSTEXPR_14
+  // constexpr
   KOKKOS_INLINE_FUNCTION
   bool is_enqueued() const volatile noexcept {
     // TODO @tasking @memory_order DSH make this an atomic load with memory
diff --git a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index 76d553601923fd7282132fbff05ce69a4e576e97..865d1c47faacfe3d6b39d4227bb180bf483c89dd 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -48,7 +48,7 @@
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
-
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 KOKKOS_FORCEINLINE_FUNCTION
 void memory_fence() {
 #if defined(__CUDA_ARCH__)
@@ -75,6 +75,7 @@ void memory_fence() {
 #error "Error: memory_fence() not defined"
 #endif
 }
+#endif
 
 //////////////////////////////////////////////////////
 // store_fence()
diff --git a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
index fe78cfbacc632d353844a5cb17f89a5d5ba067ff..1c61b73f027aaefa4993aaae7beee3ca9af05110 100644
--- a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
@@ -58,8 +58,7 @@
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
 #include <impl/Kokkos_TaskQueueCommon.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -467,7 +466,7 @@ class MultipleTaskQueue final
 
   // TODO @tasking @generalization DSH make this a property-based customization
   // point
-  static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size(
+  static /* constexpr */ size_t task_queue_allocation_size(
       typename base_t::execution_space const& exec_space,
       typename base_t::memory_space const&,
       typename base_t::memory_pool const&) {
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
index 94ea6e1a2b10c33a81e4f2c6b7a932577ce6144b..8505e8f51aec744c00dfe2fef3f29d1eb9cb7306 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
@@ -53,6 +53,7 @@
 #include <array>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <stack>
 #include <unordered_map>
 #include <unordered_set>
@@ -70,7 +71,9 @@ void tool_invoked_fence(const uint32_t /* devID */) {
    * Eventually we want to support fencing only
    * a given stream/resource
    */
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::Tools::Experimental::Impl::tool_invoked_fence: Tool Requested "
+      "Fence");
 }
 }  // namespace Impl
 #ifdef KOKKOS_ENABLE_TUNING
@@ -131,7 +134,8 @@ inline void invoke_kokkosp_callback(
     if (may_require_global_fencing == MayRequireGlobalFencing::Yes &&
         (Kokkos::Tools::Experimental::tool_requirements
              .requires_global_fencing)) {
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Tools::invoke_kokkosp_callback: Kokkos Profile Tool Fence");
     }
     (*callback)(std::forward<Args>(args)...);
   }
@@ -432,18 +436,43 @@ void initialize(const std::string& profileLibrary) {
   if (is_initialized) return;
   is_initialized = 1;
 
+  auto invoke_init_callbacks = []() {
+    Experimental::invoke_kokkosp_callback(
+        Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
+        Kokkos::Tools::Experimental::current_callbacks.init, 0,
+        (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
+
+    Experimental::tool_requirements.requires_global_fencing = true;
+
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.request_tool_settings, 1,
+        &Experimental::tool_requirements);
+
+    Experimental::ToolProgrammingInterface actions;
+    actions.fence = &Experimental::Impl::tool_invoked_fence;
+
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.provide_tool_programming_interface, 1,
+        actions);
+  };
+
 #ifdef KOKKOS_ENABLE_LIBDL
   void* firstProfileLibrary = nullptr;
 
-  if (profileLibrary.empty()) return;
+  if (profileLibrary.empty()) {
+    invoke_init_callbacks();
+    return;
+  }
 
   char* envProfileLibrary = const_cast<char*>(profileLibrary.c_str());
 
-  char* envProfileCopy =
-      (char*)malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
-  sprintf(envProfileCopy, "%s", envProfileLibrary);
+  const auto envProfileCopy =
+      std::make_unique<char[]>(strlen(envProfileLibrary) + 1);
+  sprintf(envProfileCopy.get(), "%s", envProfileLibrary);
 
-  char* profileLibraryName = strtok(envProfileCopy, ";");
+  char* profileLibraryName = strtok(envProfileCopy.get(), ";");
 
   if ((profileLibraryName != nullptr) &&
       (strcmp(profileLibraryName, "") != 0)) {
@@ -574,25 +603,8 @@ void initialize(const std::string& profileLibrary) {
 #else
   (void)profileLibrary;
 #endif  // KOKKOS_ENABLE_LIBDL
-  Experimental::invoke_kokkosp_callback(
-      Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
-      Kokkos::Tools::Experimental::current_callbacks.init, 0,
-      (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
-
-  Experimental::tool_requirements.requires_global_fencing = true;
-
-  Experimental::invoke_kokkosp_callback(
-      Experimental::MayRequireGlobalFencing::No,
-      Experimental::current_callbacks.request_tool_settings, 1,
-      &Experimental::tool_requirements);
 
-  Experimental::ToolProgrammingInterface actions;
-  actions.fence = &Experimental::Impl::tool_invoked_fence;
-
-  Experimental::invoke_kokkosp_callback(
-      Experimental::MayRequireGlobalFencing::No,
-      Experimental::current_callbacks.provide_tool_programming_interface, 1,
-      actions);
+  invoke_init_callbacks();
 
 #ifdef KOKKOS_ENABLE_TUNING
   Experimental::VariableInfo kernel_name;
@@ -656,9 +668,6 @@ void initialize(const std::string& profileLibrary) {
   Experimental::no_profiling.declare_output_type   = nullptr;
   Experimental::no_profiling.request_output_values = nullptr;
   Experimental::no_profiling.end_tuning_context    = nullptr;
-#ifdef KOKKOS_ENABLE_LIBDL
-  free(envProfileCopy);
-#endif
 }
 
 void finalize() {
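[Note] The change above replaces a manual malloc/sprintf/free triple with a std::unique_ptr<char[]>-owned buffer, removing the leak risk on early returns (the matching free() further down is deleted in the same change). A minimal sketch of the same pattern outside Kokkos; list_tokens and the delimiter payload are made up for illustration:

    #include <cstdio>
    #include <cstring>
    #include <memory>

    void list_tokens(const char* packed) {  // e.g. packed = "libA.so;libB.so"
      // strtok writes NUL terminators into its argument, so own a mutable copy.
      const auto copy = std::make_unique<char[]>(std::strlen(packed) + 1);
      std::strcpy(copy.get(), packed);
      for (char* tok = std::strtok(copy.get(), ";"); tok != nullptr;
           tok = std::strtok(nullptr, ";")) {
        std::printf("token: %s\n", tok);
      }
    }  // buffer released automatically on every return path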
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
index 1ff6a36c3bc3c934e787af30c5bd6568046f15f1..86a4cfa4a8543e58feab3aee24f1a9ac8530bb5e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
@@ -50,9 +50,12 @@
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Tuners.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
+#include <memory>
+#include <unordered_map>
 #include <map>
 #include <string>
 #include <type_traits>
+#include <mutex>
 namespace Kokkos {
 
 // forward declaration
@@ -135,6 +138,71 @@ Kokkos_Profiling_SpaceHandle make_space_handle(const char* space_name);
 
 namespace Experimental {
 
+namespace Impl {
+struct DirectFenceIDHandle {
+  uint32_t value;
+};
+//
+template <typename Space>
+uint32_t idForInstance(const uintptr_t instance) {
+  static std::mutex instance_mutex;
+  const std::lock_guard<std::mutex> lock(instance_mutex);
+  /** Needs to be a pointer due to initialization-order problems */
+  using map_type = std::map<uintptr_t, uint32_t>;
+
+  static std::shared_ptr<map_type> map;
+  if (map.get() == nullptr) {
+    map = std::make_shared<map_type>(map_type());
+  }
+
+  static uint32_t value = 0;
+  constexpr const uint32_t offset =
+      Kokkos::Tools::Experimental::NumReservedDeviceIDs;
+
+  auto find = map->find(instance);
+  if (find == map->end()) {
+    auto ret         = offset + value++;
+    (*map)[instance] = ret;
+    return ret;
+  }
+
+  return find->second;
+}
+
+template <typename Space, typename FencingFunctor>
+void profile_fence_event(const std::string& name, DirectFenceIDHandle devIDTag,
+                         const FencingFunctor& func) {
+  uint64_t handle = 0;
+  Kokkos::Tools::beginFence(
+      name,
+      Kokkos::Tools::Experimental::device_id_root<Space>() + devIDTag.value,
+      &handle);
+  func();
+  Kokkos::Tools::endFence(handle);
+}
+
+inline uint32_t int_for_synchronization_reason(
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason) {
+  switch (reason) {
+    case GlobalDeviceSynchronization: return 0;
+    case DeepCopyResourceSynchronization: return 0x00ffffff;
+  }
+  return 0;
+}
+
+template <typename Space, typename FencingFunctor>
+void profile_fence_event(
+    const std::string& name,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const FencingFunctor& func) {
+  uint64_t handle = 0;
+  Kokkos::Tools::beginFence(
+      name, device_id_root<Space>() + int_for_synchronization_reason(reason),
+      &handle);  // TODO: correct ID
+  func();
+  Kokkos::Tools::endFence(handle);
+}
+}  // namespace Impl
 void set_init_callback(initFunction callback);
 void set_finalize_callback(finalizeFunction callback);
 void set_parse_args_callback(parseArgsFunction callback);
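[Note] The profile_fence_event helpers above bracket an arbitrary fencing callable between beginFence and endFence so tools observe one named event per fence, with the device ID derived either from a per-instance handle or from a SpecialSynchronizationCases reason. A hypothetical call site (SomeSpace, instance_ptr, and raw_device_sync() are placeholders, not Kokkos API):

    namespace KTE = Kokkos::Tools::Experimental;
    KTE::Impl::profile_fence_event<SomeSpace>(
        "SomeSpace::fence: user requested",
        KTE::Impl::DirectFenceIDHandle{
            KTE::Impl::idForInstance<SomeSpace>(
                reinterpret_cast<uintptr_t>(instance_ptr))},
        [&] { raw_device_sync(); });  // only this functor does the actual fencing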
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
index ed8751c50cc04d915b7b3c371a6ec05756ff6087..2c8d1428fc595e0e6724624465ab839f5c80b138 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
@@ -54,7 +54,7 @@
 #include <stdbool.h>
 #endif
 
-#define KOKKOSP_INTERFACE_VERSION 20210225
+#define KOKKOSP_INTERFACE_VERSION 20210623
 
 // Profiling
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index 7809632f78ddf33d8429b353723736b68e3b7536..a7aec2e6fd53f6a6b37b011452b58750b092f096 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -56,6 +56,14 @@
 namespace Kokkos {
 namespace Tools {
 namespace Experimental {
+
+constexpr const uint32_t NumReservedDeviceIDs = 1;
+
+enum SpecialSynchronizationCases : int {
+  GlobalDeviceSynchronization     = 1,
+  DeepCopyResourceSynchronization = 2,
+};
+
 enum struct DeviceType {
   Serial,
   OpenMP,
@@ -68,15 +76,49 @@ enum struct DeviceType {
   Unknown
 };
 
+struct ExecutionSpaceIdentifier {
+  DeviceType type;
+  uint32_t device_id;
+  uint32_t instance_id;
+};
+inline DeviceType devicetype_from_uint32t(const uint32_t in) {
+  switch (in) {
+    case 0: return DeviceType::Serial;
+    case 1: return DeviceType::OpenMP;
+    case 2: return DeviceType::Cuda;
+    case 3: return DeviceType::HIP;
+    case 4: return DeviceType::OpenMPTarget;
+    case 5: return DeviceType::HPX;
+    case 6: return DeviceType::Threads;
+    case 7: return DeviceType::SYCL;
+    default: return DeviceType::Unknown;  // TODO: error out?
+  }
+}
+
+inline ExecutionSpaceIdentifier identifier_from_devid(const uint32_t in) {
+  // ExecutionSpaceIdentifier out;
+  // out.type = in >> 24;
+  // out.device_id = in >> 17;
+  // out.instance_id = ((uint32_t(-1)) << 17 ) & in;
+  return {devicetype_from_uint32t(in >> 24),
+          (~((uint32_t(-1)) << 24)) & (in >> 17),
+          (~((uint32_t(-1)) << 17)) & in};
+}
+
 template <typename ExecutionSpace>
 struct DeviceTypeTraits;
 
 constexpr const size_t device_type_bits = 8;
 constexpr const size_t instance_bits    = 24;
 template <typename ExecutionSpace>
+constexpr uint32_t device_id_root() {
+  constexpr auto device_id =
+      static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id);
+  return (device_id << instance_bits);
+}
+template <typename ExecutionSpace>
 inline uint32_t device_id(ExecutionSpace const& space) noexcept {
-  auto device_id = static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id);
-  return (device_id << instance_bits) + space.impl_instance_id();
+  return device_id_root<ExecutionSpace>() + space.impl_instance_id();
 }
 }  // namespace Experimental
 }  // namespace Tools
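[Note] A worked example of the packing (assuming DeviceTypeTraits<Cuda>::id == 2, consistent with the enum order handled by devicetype_from_uint32t): device_id_root<Cuda>() is 2u << 24 == 0x02000000, so instance 5 encodes as 0x02000005. identifier_from_devid then recovers type == DeviceType::Cuda from the top 8 bits (0x02000005 >> 24 == 2) and instance_id == 5 from the low 17 bits (0x02000005 & 0x1FFFF).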
diff --git a/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp b/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b67cede45bfddd657448a054aca3254d627267ce
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
@@ -0,0 +1,187 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QUAD_PRECISION_MATH_HPP
+#define KOKKOS_QUAD_PRECISION_MATH_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#if defined(KOKKOS_ENABLE_LIBQUADMATH)
+
+#include <Kokkos_NumericTraits.hpp>
+
+#include <quadmath.h>
+
+#if !(defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+#error __float128 not supported on this host
+#endif
+
+//<editor-fold desc="numeric traits __float128 specializations">
+namespace Kokkos {
+namespace Experimental {
+#if defined(KOKKOS_ENABLE_CXX17)
+#define KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(TRAIT, TYPE, VALUE_TYPE, VALUE) \
+  template <>                                                                \
+  struct TRAIT<TYPE> {                                                       \
+    static constexpr VALUE_TYPE value = VALUE;                               \
+  };                                                                         \
+  template <>                                                                \
+  inline constexpr auto TRAIT##_v<TYPE> = TRAIT<TYPE>::value;
+#else
+#define KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(TRAIT, TYPE, VALUE_TYPE, VALUE) \
+  template <>                                                                \
+  struct TRAIT<TYPE> {                                                       \
+    static constexpr VALUE_TYPE value = VALUE;                               \
+  };
+#endif
+
+// clang-format off
+// Numeric distinguished value traits
+// Workaround GCC bug https://godbolt.org/z/qWb5oe4dx
+// error: '__builtin_huge_valq()' is not a constant expression
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 710)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(infinity,       __float128, __float128, HUGE_VALQ)
+#endif
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_min,     __float128, __float128, -FLT128_MAX)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_max,     __float128, __float128, FLT128_MAX)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(epsilon,        __float128, __float128, FLT128_EPSILON)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(round_error,    __float128, __float128, static_cast<__float128>(0.5))
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(norm_min,       __float128, __float128, FLT128_MIN)
+
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits,         __float128,        int, FLT128_MANT_DIG)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits10,       __float128,        int, FLT128_DIG)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_digits10,   __float128,        int, 36)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(radix,          __float128,        int, 2)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(min_exponent,   __float128,        int, FLT128_MIN_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_exponent,   __float128,        int, FLT128_MAX_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(min_exponent10, __float128,        int, FLT128_MIN_10_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_exponent10, __float128,        int, FLT128_MAX_10_EXP)
+// clang-format on
+
+#undef KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT
+}  // namespace Experimental
+}  // namespace Kokkos
+//</editor-fold>
+
+namespace Kokkos {
+template <>
+struct reduction_identity<__float128> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 sum() {
+    return static_cast<__float128>(0.0);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 prod() {
+    return static_cast<__float128>(1.0);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 max() {
+    return -FLT128_MAX;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 min() {
+    return FLT128_MAX;
+  }
+};
+}  // namespace Kokkos
+
+//<editor-fold desc="Common mathematical functions __float128 overloads">
+namespace Kokkos {
+namespace Experimental {
+// clang-format off
+// Basic operations
+inline __float128 fabs(__float128 x) { return ::fabsq(x); }
+inline __float128 fmod(__float128 x, __float128 y) { return ::fmodq(x, y); }
+inline __float128 remainder(__float128 x, __float128 y) { return ::remainderq(x, y); }
+inline __float128 fmin(__float128 x, __float128 y) { return ::fminq(x, y); }
+inline __float128 fmax(__float128 x, __float128 y) { return ::fmaxq(x, y); }
+inline __float128 fdim(__float128 x, __float128 y) { return ::fdimq(x, y); }
+inline __float128 nanq(char const* arg) { return ::nanq(arg); }
+// Power functions
+inline __float128 pow(__float128 x, __float128 y) { return ::powq(x, y); }
+inline __float128 sqrt(__float128 x) { return ::sqrtq(x); }
+inline __float128 cbrt(__float128 x) { return ::cbrtq(x); }
+inline __float128 hypot(__float128 x, __float128 y) { return ::hypotq(x, y); }
+// Exponential functions
+inline __float128 exp(__float128 x) { return ::expq(x); }
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 910)
+inline __float128 exp2(__float128 x) { return ::exp2q(x); }
+#endif
+inline __float128 expm1(__float128 x) { return ::expm1q(x); }
+inline __float128 log(__float128 x) { return ::logq(x); }
+inline __float128 log10(__float128 x) { return ::log10q(x); }
+inline __float128 log2(__float128 x) { return ::log2q(x); }
+inline __float128 log1p(__float128 x) { return ::log1pq(x); }
+// Trigonometric functions
+inline __float128 sin(__float128 x) { return ::sinq(x); }
+inline __float128 cos(__float128 x) { return ::cosq(x); }
+inline __float128 tan(__float128 x) { return ::tanq(x); }
+inline __float128 asin(__float128 x) { return ::asinq(x); }
+inline __float128 acos(__float128 x) { return ::acosq(x); }
+inline __float128 atan(__float128 x) { return ::atanq(x); }
+inline __float128 atan2(__float128 x, __float128 y) { return ::atan2q(x, y); }
+// Hyperbolic functions
+inline __float128 sinh(__float128 x) { return ::sinhq(x); }
+inline __float128 cosh(__float128 x) { return ::coshq(x); }
+inline __float128 tanh(__float128 x) { return ::tanhq(x); }
+inline __float128 asinh(__float128 x) { return ::asinhq(x); }
+inline __float128 acosh(__float128 x) { return ::acoshq(x); }
+inline __float128 atanh(__float128 x) { return ::atanhq(x); }
+// Error and gamma functions
+inline __float128 erf(__float128 x) { return ::erfq(x); }
+inline __float128 erfc(__float128 x) { return ::erfcq(x); }
+inline __float128 tgamma(__float128 x) { return ::tgammaq(x); }
+inline __float128 lgamma(__float128 x) { return ::lgammaq(x); }
+// Nearest integer floating point operations
+inline __float128 ceil(__float128 x) { return ::ceilq(x); }
+inline __float128 floor(__float128 x) { return ::floorq(x); }
+inline __float128 trunc(__float128 x) { return ::truncq(x); }
+inline __float128 nearbyint(__float128 x) { return ::nearbyintq(x); }
+// Classification and comparison
+inline bool isfinite(__float128 x) { return ::finiteq(x) != 0; }  // isfiniteq not provided; finiteq rejects both inf and nan
+inline bool isinf(__float128 x) { return ::isinfq(x); }
+inline bool isnan(__float128 x) { return ::isnanq(x); }
+}  // namespace Experimental
+}  // namespace Kokkos
+//</editor-fold>
+
+#endif
+
+#endif
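[Note] A host-only usage sketch for the overloads in this new header, assuming a build with -DKokkos_ENABLE_LIBQUADMATH=ON and that the header is reachable on the include path (the direct impl/ include below is illustrative):

    #include <impl/Kokkos_QuadPrecisionMath.hpp>
    #include <quadmath.h>
    #include <cstdio>

    int main() {
      __float128 x = 2;  // integer init sidesteps the GCC-only 2.0q literal
      __float128 r = Kokkos::Experimental::sqrt(x);  // dispatches to ::sqrtq
      char buf[128];
      quadmath_snprintf(buf, sizeof buf, "%.30Qg", r);  // quad-precision output
      std::printf("sqrt(2) = %s\n", buf);
    }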
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
index 4bd037906506bd27654067d5c2fda99fb59684ca..c49e838d8f0b0961b9dfd2bc76c07b5370cb2629 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -58,28 +58,59 @@
 
 namespace Kokkos {
 namespace Impl {
-namespace {
 
-HostThreadTeamData g_serial_thread_team_data;
+bool SerialInternal::is_initialized() { return m_is_initialized; }
 
-bool g_serial_is_initialized = false;
+void SerialInternal::initialize() {
+  if (is_initialized()) return;
 
-}  // namespace
+  Impl::SharedAllocationRecord<void, void>::tracking_enable();
+
+  // Init the array of locks used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+  m_is_initialized = true;
+}
+
+void SerialInternal::finalize() {
+  if (m_thread_team_data.scratch_buffer()) {
+    m_thread_team_data.disband_team();
+    m_thread_team_data.disband_pool();
+
+    Kokkos::HostSpace space;
+
+    space.deallocate(m_thread_team_data.scratch_buffer(),
+                     m_thread_team_data.scratch_bytes());
+
+    m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
+  }
+
+  Kokkos::Profiling::finalize();
+
+  m_is_initialized = false;
+}
+
+SerialInternal& SerialInternal::singleton() {
+  static SerialInternal* self = nullptr;
+  if (!self) {
+    self = new SerialInternal();
+  }
+  return *self;
+}
 
 // Resize thread team data scratch memory
-void serial_resize_thread_team_data(size_t pool_reduce_bytes,
-                                    size_t team_reduce_bytes,
-                                    size_t team_shared_bytes,
-                                    size_t thread_local_bytes) {
+void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes,
+                                             size_t team_reduce_bytes,
+                                             size_t team_shared_bytes,
+                                             size_t thread_local_bytes) {
   if (pool_reduce_bytes < 512) pool_reduce_bytes = 512;
   if (team_reduce_bytes < 512) team_reduce_bytes = 512;
 
-  const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
-  const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
-  const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
-  const size_t old_thread_local =
-      g_serial_thread_team_data.thread_local_bytes();
-  const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
+  const size_t old_pool_reduce  = m_thread_team_data.pool_reduce_bytes();
+  const size_t old_team_reduce  = m_thread_team_data.team_reduce_bytes();
+  const size_t old_team_shared  = m_thread_team_data.team_shared_bytes();
+  const size_t old_thread_local = m_thread_team_data.thread_local_bytes();
+  const size_t old_alloc_bytes  = m_thread_team_data.scratch_bytes();
 
   // Allocate if any of the old allocations is too small:
 
@@ -92,12 +123,12 @@ void serial_resize_thread_team_data(size_t pool_reduce_bytes,
     Kokkos::HostSpace space;
 
     if (old_alloc_bytes) {
-      g_serial_thread_team_data.disband_team();
-      g_serial_thread_team_data.disband_pool();
+      m_thread_team_data.disband_team();
+      m_thread_team_data.disband_pool();
 
       space.deallocate("Kokkos::Serial::scratch_mem",
-                       g_serial_thread_team_data.scratch_buffer(),
-                       g_serial_thread_team_data.scratch_bytes());
+                       m_thread_team_data.scratch_buffer(),
+                       m_thread_team_data.scratch_bytes());
     }
 
     if (pool_reduce_bytes < old_pool_reduce) {
@@ -125,56 +156,37 @@ void serial_resize_thread_team_data(size_t pool_reduce_bytes,
       Kokkos::Impl::throw_runtime_exception(failure.get_error_message());
     }
 
-    g_serial_thread_team_data.scratch_assign(
-        ((char*)ptr), alloc_bytes, pool_reduce_bytes, team_reduce_bytes,
-        team_shared_bytes, thread_local_bytes);
+    m_thread_team_data.scratch_assign(static_cast<char*>(ptr), alloc_bytes,
+                                      pool_reduce_bytes, team_reduce_bytes,
+                                      team_shared_bytes, thread_local_bytes);
 
-    HostThreadTeamData* pool[1] = {&g_serial_thread_team_data};
+    HostThreadTeamData* pool[1] = {&m_thread_team_data};
 
-    g_serial_thread_team_data.organize_pool(pool, 1);
-    g_serial_thread_team_data.organize_team(1);
+    m_thread_team_data.organize_pool(pool, 1);
+    m_thread_team_data.organize_team(1);
   }
 }
-
-HostThreadTeamData* serial_get_thread_team_data() {
-  return &g_serial_thread_team_data;
-}
-
 }  // namespace Impl
-}  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
+Serial::Serial()
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    : m_space_instance(&Impl::SerialInternal::singleton()) {
+}
+#else
+    : m_space_instance(&Impl::SerialInternal::singleton(),
+                       [](Impl::SerialInternal*) {}) {
+}
+#endif
 
-bool Serial::impl_is_initialized() { return Impl::g_serial_is_initialized; }
+bool Serial::impl_is_initialized() {
+  return Impl::SerialInternal::singleton().is_initialized();
+}
 
 void Serial::impl_initialize() {
-  Impl::SharedAllocationRecord<void, void>::tracking_enable();
-
-  // Init the array of locks used for arbitrarily sized atomics
-  Impl::init_lock_array_host_space();
-
-  Impl::g_serial_is_initialized = true;
+  Impl::SerialInternal::singleton().initialize();
 }
 
-void Serial::impl_finalize() {
-  if (Impl::g_serial_thread_team_data.scratch_buffer()) {
-    Impl::g_serial_thread_team_data.disband_team();
-    Impl::g_serial_thread_team_data.disband_pool();
-
-    Kokkos::HostSpace space;
-
-    space.deallocate(Impl::g_serial_thread_team_data.scratch_buffer(),
-                     Impl::g_serial_thread_team_data.scratch_bytes());
-
-    Impl::g_serial_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
-  }
-
-  Kokkos::Profiling::finalize();
-
-  Impl::g_serial_is_initialized = false;
-}
+void Serial::impl_finalize() { Impl::SerialInternal::singleton().finalize(); }
 
 const char* Serial::name() { return "Serial"; }
 
@@ -198,6 +210,9 @@ void SerialSpaceInitializer::finalize(const bool) {
 }
 
 void SerialSpaceInitializer::fence() { Kokkos::Serial::impl_static_fence(); }
+void SerialSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Serial::impl_static_fence(name);
+}
 
 void SerialSpaceInitializer::print_configuration(std::ostream& msg,
                                                  const bool detail) {
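[Note] The shape of SerialInternal::singleton() above is deliberate: the instance is newed once through a function-local pointer and never deleted, so no destructor of another static can observe it torn down at program exit. A compact variant of the same leaking pattern, for contrast with a destructor-running Meyers singleton:

    SerialInternal& singleton() {
      static SerialInternal* self = new SerialInternal();  // leaked on purpose
      return *self;
    }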
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
index 3ac3899acaf9f3695025472fc5f02cb0708f64fb..be732f4486d4618b4f8601d1859be44ce1a31296 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -76,14 +76,18 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType> > {
   static void execute(scheduler_type const& scheduler) {
     using task_base_type = typename scheduler_type::task_base_type;
 
+    auto const& serial_execution_space = scheduler.get_execution_space();
+
     // Set default buffers
-    serial_resize_thread_team_data(0,   /* global reduce buffer */
-                                   512, /* team reduce buffer */
-                                   0,   /* team shared buffer */
-                                   0    /* thread local buffer */
-    );
+    serial_execution_space.impl_internal_space_instance()
+        ->resize_thread_team_data(0,   /* global reduce buffer */
+                                  512, /* team reduce buffer */
+                                  0,   /* team shared buffer */
+                                  0    /* thread local buffer */
+        );
 
-    Impl::HostThreadTeamData& self = *Impl::serial_get_thread_team_data();
+    auto& self = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
     auto& queue         = scheduler.queue();
     auto team_scheduler = scheduler.get_team_scheduler(0);
@@ -147,9 +151,11 @@ class TaskQueueSpecializationConstrained<
 
     task_base_type* const end = (task_base_type*)task_base_type::EndTag;
 
-    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();
+    execution_space serial_execution_space;
+    auto& data = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
-    member_type exec(scheduler, *data);
+    member_type exec(scheduler, data);
 
     // Loop until no runnable task
 
@@ -181,18 +187,22 @@ class TaskQueueSpecializationConstrained<
 
     task_base_type* const end = (task_base_type*)task_base_type::EndTag;
 
+    execution_space serial_execution_space;
+
     // Set default buffers
-    serial_resize_thread_team_data(0,   /* global reduce buffer */
-                                   512, /* team reduce buffer */
-                                   0,   /* team shared buffer */
-                                   0    /* thread local buffer */
-    );
+    serial_execution_space.impl_internal_space_instance()
+        ->resize_thread_team_data(0,   /* global reduce buffer */
+                                  512, /* team reduce buffer */
+                                  0,   /* team shared buffer */
+                                  0    /* thread local buffer */
+        );
 
     auto* const queue = scheduler.m_queue;
 
-    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();
+    auto& data = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
-    member_type exec(scheduler, *data);
+    member_type exec(scheduler, data);
 
     // Loop until all queues are empty
     while (0 < queue->m_ready_count) {
@@ -210,16 +220,6 @@ class TaskQueueSpecializationConstrained<
 
         (*task->m_apply)(task, &exec);
 
-#if 0
-        printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
-        , uintptr_t(task)
-        , uintptr_t(task->m_wait)
-        , uintptr_t(task->m_next)
-        , task->m_task_type
-        , task->m_priority
-        , task->m_ref_count );
-#endif
-
         // If a respawn then re-enqueue otherwise the task is complete
         // and all tasks waiting on this task are updated.
         queue->complete(task);
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
index 917ae72081c6a5eee98b4e02827097446cb29e0b..3efff98e459e8a8b92983d195a7a4486672bdce4 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -259,6 +259,9 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord<
     while ((root_next = Kokkos::atomic_exchange(&arg_record->m_root->m_next,
                                                 zero)) == nullptr)
       ;
+    // We need a memory_fence() here so that the following update
+    // is properly sequenced
+    Kokkos::memory_fence();
 
     arg_record->m_next->m_prev = arg_record->m_prev;
 
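[Note] The spin above takes exclusive ownership of the record list by exchanging a null sentinel into the root's m_next until a real pointer comes back; the new fence keeps the unlink writes from being reordered before that acquisition. A generic sketch of the idiom, under the assumption that nullptr never appears as a legitimate head (Node, list_head, and locked_push are illustrative, not Kokkos API):

    #include <Kokkos_Atomic.hpp>

    struct Node { Node* next = nullptr; };
    Node* list_head;  // shared; nullptr doubles as the "locked" sentinel

    void locked_push(Node* n) {
      Node* head;
      // Acquire: keep swapping the sentinel in until we win the real head.
      while ((head = Kokkos::atomic_exchange(
                  &list_head, static_cast<Node*>(nullptr))) == nullptr) {
      }
      Kokkos::memory_fence();  // pointer updates below stay after the acquire
      n->next = head;
      Kokkos::atomic_exchange(&list_head, n);  // release: publish the new head
    }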
diff --git a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
index 0773a0914befe4e9db3b3b79ae3c446bcb0f3ad1..7f222c92ca704908e7e6be05229c976d113395f9 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
index a0eccffb627f39f1810978aa0d3ab25c9458e4e8..0584cd29eb70470f4c206317d35a57f24893a518 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
@@ -58,8 +58,7 @@
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
 #include <impl/Kokkos_TaskQueueCommon.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Tags.hpp b/packages/kokkos/core/src/impl/Kokkos_Tags.hpp
deleted file mode 100644
index eea4c938661afa00f4dad929312bf6cfa2b83776..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Tags.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TAGS_HPP
-#define KOKKOS_TAGS_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-#include <Kokkos_Core_fwd.hpp>
-#include <type_traits>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** KOKKOS_IMPL_HAS_TYPE( Type )
- *
- * defines a meta-function that check if a type expose an internal alias which
- * matches Type
- *
- * e.g.
- *   KOKKOS_IMPL_HAS_TYPE( array_layout );
- *   struct Foo { using array_layout = void; };
- *   have_array_layout<Foo>::value == 1;
- */
-#define KOKKOS_IMPL_HAS_TYPE(TYPE)                                             \
-  template <typename T>                                                        \
-  struct have_##TYPE {                                                         \
-   private:                                                                    \
-    template <typename U, typename = void>                                     \
-    struct X : std::false_type {};                                             \
-    template <typename U>                                                      \
-    struct X<U, typename std::conditional<true, void, typename X::TYPE>::type> \
-        : std::true_type {};                                                   \
-                                                                               \
-   public:                                                                     \
-    using type = typename X<T>::type;                                          \
-    enum : bool { value = type::value };                                       \
-  };
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <typename T>
-using is_void = std::is_same<void, T>;
-
-}
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
index 2d0f62a563712a1182849fd8dc43349f6996a42e..06581052a8f687aeaff85d1368fd271407f0c36e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
@@ -203,14 +203,17 @@ class TaskBase {
 
     // Assign dependence to m_next.  It will be processed in the subsequent
     // call to schedule.  Error if the dependence is reset.
-    if (lock != Kokkos::atomic_exchange(&m_next, dep)) {
+    if (lock != Kokkos::Impl::desul_atomic_exchange(
+                    &m_next, dep, Kokkos::Impl::MemoryOrderSeqCst(),
+                    Kokkos::Impl::MemoryScopeDevice())) {
       Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
     }
-
     if (nullptr != dep) {
       // The future may be destroyed upon returning from this call
       // so increment reference count to track this assignment.
-      Kokkos::atomic_increment(&(dep->m_ref_count));
+      Kokkos::Impl::desul_atomic_inc(&(dep->m_ref_count),
+                                     Kokkos::Impl::MemoryOrderSeqCst(),
+                                     Kokkos::Impl::MemoryScopeDevice());
     }
   }
 
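[Note] Here and in the task-queue changes that follow, the implicit Kokkos::atomic_increment/atomic_exchange calls become desul entry points that spell out both the memory order and the scope. The general shape, with a placeholder counter:

    int my_counter = 0;  // placeholder object
    Kokkos::Impl::desul_atomic_inc(
        &my_counter,
        Kokkos::Impl::MemoryOrderSeqCst(),   // strongest ordering, as in the hunks
        Kokkos::Impl::MemoryScopeDevice());  // visible across the whole device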
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
index 42afa93cdcc4db4f4c0223d7b85f5edb8256ee31..caf1d0a84b82e3e6b121476c8cfd0213775dd69f 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
@@ -151,6 +151,7 @@ class ReferenceCountedBase {
   bool decrement_and_check_reference_count() {
     // TODO @tasking @memory_order DSH memory order
     auto old_count = Kokkos::atomic_fetch_add(&m_ref_count, -1);
+    Kokkos::memory_fence();
 
     KOKKOS_ASSERT(old_count > 0 && "reference count must be greater than zero!");
 
@@ -158,7 +159,11 @@ class ReferenceCountedBase {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void increment_reference_count() { Kokkos::atomic_increment(&m_ref_count); }
+  void increment_reference_count() {
+    Kokkos::Impl::desul_atomic_inc(&m_ref_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
+  }
 };
 
 template <class TaskQueueTraits, class SchedulingInfo>
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
index c0d2eca9c106305e1bfdeb8efd634f657df90c8b..e74e84a2e535b5953ae58667a2a7f4b4f53b293d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -58,8 +58,7 @@
 #include <impl/Kokkos_TaskBase.hpp>
 #include <impl/Kokkos_TaskResult.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -188,25 +187,11 @@ class TaskQueue : public TaskQueueBase {
   // Assign task pointer with reference counting of assigned tasks
   KOKKOS_FUNCTION static void assign(task_root_type** const lhs,
                                      task_root_type* const rhs) {
-#if 0
-  {
-    printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
-          , uintptr_t( lhs ? *lhs : 0 )
-          , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 )
-          , int( lhs && *lhs ? (*lhs)->m_task_type : 0 )
-          , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 )
-          , uintptr_t(rhs)
-          , uintptr_t( rhs ? rhs->m_next : 0 )
-          , int( rhs ? rhs->m_task_type : 0 )
-          , int( rhs ? rhs->m_ref_count : 0 )
-          );
-    fflush( stdout );
-  }
-#endif
-
     if (*lhs) decrement(*lhs);
     if (rhs) {
-      Kokkos::atomic_increment(&(rhs->m_ref_count));
+      Kokkos::Impl::desul_atomic_inc(&rhs->m_ref_count,
+                                     Kokkos::Impl::MemoryOrderSeqCst(),
+                                     Kokkos::Impl::MemoryScopeDevice());
     }
 
     // Force write of *lhs
@@ -234,13 +219,7 @@ class TaskQueue : public TaskQueueBase {
 
     using task_type = Impl::Task<execution_space, value_type, FunctorType>;
 
-    enum : size_t { align = (1 << 4), align_mask = align - 1 };
-    enum : size_t { task_size = sizeof(task_type) };
-    enum : size_t { result_size = Impl::TaskResult<value_type>::size };
-    enum : size_t {
-      alloc_size = ((task_size + align_mask) & ~align_mask) +
-                   ((result_size + align_mask) & ~align_mask)
-    };
+    constexpr size_t task_size = sizeof(task_type);
 
     return m_memory.allocate_block_size(task_size);
   }
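[Note] The deleted enum rounded the task and result sizes up to 16-byte alignment with the usual mask trick, (size + align_mask) & ~align_mask: for example, task_size == 40 rounds to (40 + 15) & ~15 == 48. Only sizeof(task_type) survives, presumably because m_memory.allocate_block_size already applies the pool's own size rounding, making the hand-rolled arithmetic dead weight.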
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
index cae06d4ea5ca17b5924a7bbf8c415d6f0a3ab070..757e5f98864bc3c74faa8f4bdfffb795e68aab60 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
@@ -57,8 +57,7 @@
 #include <impl/Kokkos_TaskResult.hpp>
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -88,6 +87,7 @@ class TaskQueueCommonMixin {
   // <editor-fold desc="Constructors, destructor, and assignment"> {{{2
 
   TaskQueueCommonMixin() : m_ready_count(0) {
+    Kokkos::memory_fence();
     // TODO @tasking @memory_order DSH figure out if I need this store to be
     // atomic
   }
@@ -158,14 +158,17 @@ class TaskQueueCommonMixin {
   KOKKOS_INLINE_FUNCTION
   void _increment_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::atomic_increment(&this->m_ready_count);
+    Kokkos::Impl::desul_atomic_inc(&this->m_ready_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
   }
 
   KOKKOS_INLINE_FUNCTION
   void _decrement_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::atomic_decrement(&this->m_ready_count);
-    Kokkos::memory_fence();
+    Kokkos::Impl::desul_atomic_dec(&this->m_ready_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
   }
 
  public:
@@ -476,7 +479,7 @@ class TaskQueueCommonMixin {
   }
 
   template <class ExecutionSpace, class MemorySpace, class MemoryPool>
-  static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size(
+  static /* constexpr */ size_t task_queue_allocation_size(
       ExecutionSpace const&, MemorySpace const&, MemoryPool const&)
   // requires Same<ExecutionSpace, typename Derived::execution_space>
   //            && Same<MemorySpace, typename Derived::memory_space>
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
index 6e2481f93567a671a5ad66f3536b45c009286eca..3a71aa17e69042c791e9e7302c3c14cdca91b8aa 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
@@ -56,8 +56,7 @@
 #include <impl/Kokkos_TaskBase.hpp>
 #include <impl/Kokkos_TaskResult.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -103,8 +102,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
     } else {
       void* data = m_pool.allocate(static_cast<size_t>(requested_size));
 
-      // Kokkos::atomic_increment(&m_accum_alloc); // memory_order_relaxed
-      Kokkos::atomic_increment(&m_count_alloc);  // memory_order_relaxed
+      Kokkos::Impl::desul_atomic_inc(
+          &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+          Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
       // TODO @tasking @minor DSH make this thread safe? (otherwise, it's just
       // an approximation, which is probably fine...)
       if (m_max_alloc < m_count_alloc) m_max_alloc = m_count_alloc;
@@ -200,7 +200,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
   KOKKOS_INLINE_FUNCTION void deallocate(
       PoolAllocatedObjectBase<CountType>&& obj) {
     m_pool.deallocate((void*)&obj, 1);
-    Kokkos::atomic_decrement(&m_count_alloc);  // memory_order_relaxed
+    Kokkos::Impl::desul_atomic_dec(
+        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
index efee3d051dc8fb4e112219527bb322404ff4dfe6..5f98e8d85e9214289ce43f98b920ec27da4a672f 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
@@ -59,9 +59,7 @@
 #include <impl/Kokkos_TaskResult.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
-#include <impl/Kokkos_Atomic_Decrement.hpp>
+#include <Kokkos_Atomic.hpp>
 
 #include <string>
 #include <typeinfo>
@@ -159,8 +157,14 @@ class TaskQueueMultiple : public TaskQueue<ExecSpace, MemorySpace> {
               // task stolen.
               // first increment our ready count, then decrement the ready count
               // on the other queue:
-              Kokkos::atomic_increment(&this->m_ready_count);
-              Kokkos::atomic_decrement(&steal_from.m_ready_count);
+              Kokkos::Impl::desul_atomic_inc(
+                  &this->m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
+                                                       // memory_order_relaxed
+              Kokkos::Impl::desul_atomic_dec(
+                  &steal_from.m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
+                                                       // memory_order_relaxed
               return rv;
             }
           }
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
index a87e5f72721f95f06d1c9f90c17e92d0e1fec2fb..324227cf5e615f184492de9471fa0f78700ae11a 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -105,6 +105,7 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::decrement(
   task_root_type volatile &t = *task;
 
   const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count), -1);
+  Kokkos::memory_fence();
 
 #if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
   if (1 == count) {
@@ -146,8 +147,9 @@ KOKKOS_FUNCTION void *TaskQueue<ExecSpace, MemorySpace>::allocate(size_t n) {
   void *const p = m_memory.allocate(n);
 
   if (p) {
-    // Kokkos::atomic_increment( & m_accum_alloc );
-    Kokkos::atomic_increment(&m_count_alloc);
+    Kokkos::Impl::desul_atomic_inc(
+        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     // if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
   }
@@ -159,7 +161,9 @@ template <typename ExecSpace, typename MemorySpace>
 KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::deallocate(void *p,
                                                                    size_t n) {
   m_memory.deallocate(p, n);
-  Kokkos::atomic_decrement(&m_count_alloc);
+  Kokkos::Impl::desul_atomic_dec(
+      &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+      Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 }
 
 //----------------------------------------------------------------------------
@@ -210,7 +214,9 @@ KOKKOS_FUNCTION bool TaskQueue<ExecSpace, MemorySpace>::push_task(
     //     *queue = task;
     //   }
     //   old_head = *queue;
-    old_head = Kokkos::atomic_compare_exchange(queue, old_head, task);
+    old_head = Kokkos::Impl::desul_atomic_compare_exchange(
+        const_cast<task_root_type **>(queue), old_head, task,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (old_head_tmp == old_head) return true;
   }
@@ -258,7 +264,10 @@ TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
 
     task_root_type *const x = task;
 
-    task = Kokkos::atomic_compare_exchange(queue, x, lock);
+    //    task = Kokkos::atomic_compare_exchange(queue, x, lock);
+    task = Kokkos::Impl::desul_atomic_compare_exchange(
+        const_cast<task_root_type **>(queue), x, lock,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (x == task) {
       // CAS succeeded and queue is locked
@@ -274,6 +283,8 @@ TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
       // This thread has exclusive access to
       // the queue and the popped task's m_next.
 
+      Kokkos::memory_fence();
+
       task_root_type *volatile &next = task->m_next;
 
       // This algorithm is not lockfree because an adversarial scheduler could
@@ -400,7 +411,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::schedule_runnable(
     // to track number of ready + executing tasks.
     // The ready count will be decremented when the task is complete.
 
-    Kokkos::atomic_increment(&m_ready_count);
+    Kokkos::Impl::desul_atomic_inc(
+        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     task_root_type *volatile *const ready_queue =
         &m_ready[t.m_priority][t.m_task_type];
@@ -553,8 +566,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::reschedule(
 
   task_root_type *const zero = nullptr;
   task_root_type *const lock = (task_root_type *)task_root_type::LockTag;
-
-  if (lock != Kokkos::atomic_exchange(&task->m_next, zero)) {
+  if (lock != Kokkos::Impl::desul_atomic_exchange(
+                  &task->m_next, zero, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice())) {
     Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
   }
 }
@@ -601,8 +615,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
 
     // Stop other tasks from adding themselves to this task's wait queue
     // by locking the head of this task's wait queue.
-
-    task_root_type *x = Kokkos::atomic_exchange(&t.m_wait, lock);
+    task_root_type *x = Kokkos::Impl::desul_atomic_exchange(
+        const_cast<task_root_type **>(&t.m_wait), lock,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (x != (task_root_type *)lock) {
       // This thread has transitioned this 'task' to complete.
@@ -645,7 +660,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
     // A runnable task was popped from a ready queue and executed.
     // If respawned into a ready queue then the ready count was incremented
     // so decrement whether respawned or not.
-    Kokkos::atomic_decrement(&m_ready_count);
+    Kokkos::Impl::desul_atomic_dec(
+        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 }
 
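[Note] push_task above is the classic compare-and-swap publication loop spelled out in its comment: read the head, link the new task in front of it, and try to install the task as the new head, retrying against whatever the winner installed. A simplified sketch of the shape (Node stands in for task_root_type, and the lock-tag early exit of the real code is omitted):

    #include <Kokkos_Atomic.hpp>

    struct Node { Node* next = nullptr; };

    void push(Node* volatile* queue, Node* task) {
      Node* old_head = *queue;
      for (;;) {
        task->next = old_head;  // link before attempting to publish
        Node* seen = Kokkos::Impl::desul_atomic_compare_exchange(
            const_cast<Node**>(queue), old_head, task,
            Kokkos::Impl::MemoryOrderSeqCst(),
            Kokkos::Impl::MemoryScopeDevice());
        if (seen == old_head) return;  // CAS succeeded: task is the new head
        old_head = seen;               // lost the race; retry against the winner
      }
    }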
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
index 2faab5794907ecd43ccead5f74e09e414a8f3541..f53dfe5a96621a0e31d3deb95bb83ac9bef35907 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_Timer.hpp b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
index e8004ff85258975d3f36ee2a9345414e27ea09ad..6edf571d7892e4260fc7d617a1a86a63d265baa2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
@@ -45,8 +45,13 @@
 #ifndef KOKKOS_IMPLWALLTIME_HPP
 #define KOKKOS_IMPLWALLTIME_HPP
 
+#include <Kokkos_Macros.hpp>
+
+KOKKOS_IMPL_WARNING("This file is deprecated. Use <Kokkos_Timer.hpp> instead.")
+
 #include <Kokkos_Timer.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 namespace Impl {
 
@@ -54,10 +59,11 @@ namespace Impl {
  *   Timer promoted from Impl to Kokkos ns
  *   This file is included for backwards compatibility
  */
-
-using Kokkos::Timer;
+using Timer KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::Timer instead!") =
+    Kokkos::Timer;
 
 }  // namespace Impl
 }  // namespace Kokkos
+#endif
 
 #endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
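[Note] The deprecated alias above is the standard C++14 pattern for retiring a name without breaking existing includes: the attribute attaches to the using-declaration, so every use site of Kokkos::Impl::Timer gets the warning. A standalone analogue:

    struct NewName {};
    using OldName [[deprecated("Use NewName instead!")]] = NewName;
    OldName x;  // warning: 'OldName' is deprecated: Use NewName instead!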
diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
index cb8cf281ae06fe0a71862b47428a2ffa12f4bd67..bea7c2c9d1e56a61802bc47c60f00e82496c8061 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -65,13 +65,6 @@ struct identity {
 template <typename T>
 using identity_t = typename identity<T>::type;
 
-struct not_a_type {
-  not_a_type()                  = delete;
-  ~not_a_type()                 = delete;
-  not_a_type(not_a_type const&) = delete;
-  void operator=(not_a_type const&) = delete;
-};
-
 #if defined(__cpp_lib_void_t)
 // since C++17
 using std::void_t;
@@ -158,6 +151,112 @@ struct destruct_delete {
 template <class...>
 struct type_list;
 
+//------------------------------------------------------------------------------
+// <editor-fold desc="type_list_remove_first"> {{{2
+
+// Currently linear complexity; if we use this a lot, maybe make it better?
+
+template <class Entry, class InList, class OutList>
+struct _type_list_remove_first_impl;
+
+template <class Entry, class T, class... Ts, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<T, Ts...>,
+                                    type_list<OutTs...>>
+    : _type_list_remove_first_impl<Entry, type_list<Ts...>,
+                                   type_list<OutTs..., T>> {};
+
+template <class Entry, class... Ts, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<Entry, Ts...>,
+                                    type_list<OutTs...>>
+    : _type_list_remove_first_impl<Entry, type_list<>,
+                                   type_list<OutTs..., Ts...>> {};
+
+template <class Entry, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<>, type_list<OutTs...>>
+    : identity<type_list<OutTs...>> {};
+
+template <class Entry, class List>
+struct type_list_remove_first
+    : _type_list_remove_first_impl<Entry, List, type_list<>> {};
+
+// </editor-fold> end type_list_remove_first }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="type_list_any"> {{{2
+
+template <template <class> class UnaryPred, class List>
+struct type_list_any;
+
+#ifdef KOKKOS_ENABLE_CXX17
+template <template <class> class UnaryPred, class... Ts>
+struct type_list_any<UnaryPred, type_list<Ts...>>
+    : std::bool_constant<(UnaryPred<Ts>::value || ...)> {};
+#else
+template <template <class> class UnaryPred, class T, class... Ts>
+struct type_list_any<UnaryPred, type_list<T, Ts...>> {
+  using type = typename std::conditional_t<
+      UnaryPred<T>::value, std::true_type,
+      type_list_any<UnaryPred, type_list<Ts...>>>::type;
+  static constexpr auto value = type::value;
+};
+
+template <template <class> class UnaryPred>
+struct type_list_any<UnaryPred, type_list<>> : std::false_type {};
+
+#endif
+
+// </editor-fold> end type_list_any }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="concat_type_list"> {{{2
+//  concat_type_list combines types in multiple type_lists
+
+// forward declaration
+template <typename... T>
+struct concat_type_list;
+
+// alias
+template <typename... T>
+using concat_type_list_t = typename concat_type_list<T...>::type;
+
+// final instantiation
+template <typename... T>
+struct concat_type_list<type_list<T...>> {
+  using type = type_list<T...>;
+};
+
+// combine consecutive type_lists
+template <typename... T, typename... U, typename... Tail>
+struct concat_type_list<type_list<T...>, type_list<U...>, Tail...>
+    : concat_type_list<type_list<T..., U...>, Tail...> {};
+// </editor-fold> end concat_type_list }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="filter_type_list"> {{{2
+//  filter_type_list generates type-list of types which satisfy
+//  PredicateT<T>::value == ValueT
+
+template <template <typename> class PredicateT, typename TypeListT,
+          bool ValueT = true>
+struct filter_type_list;
+
+template <template <typename> class PredicateT, typename... T, bool ValueT>
+struct filter_type_list<PredicateT, type_list<T...>, ValueT> {
+  using type =
+      concat_type_list_t<std::conditional_t<PredicateT<T>::value == ValueT,
+                                            type_list<T>, type_list<>>...>;
+};
+
+template <template <typename> class PredicateT, typename T, bool ValueT = true>
+using filter_type_list_t =
+    typename filter_type_list<PredicateT, T, ValueT>::type;
+
+// </editor-fold> end filter_type_list }}}2
+//------------------------------------------------------------------------------
+
 // </editor-fold> end type_list }}}1
 //==============================================================================
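To make the semantics of these utilities concrete, here is a standalone compile-time sketch; it restates concat_type_list and filter_type_list locally and uses std::is_pointer as a stand-in predicate (nothing in this block is part of the patch itself):

    #include <type_traits>

    template <class... Ts>
    struct type_list {};

    template <class... T>
    struct concat_type_list;
    template <class... T>
    using concat_type_list_t = typename concat_type_list<T...>::type;
    template <class... T>
    struct concat_type_list<type_list<T...>> {
      using type = type_list<T...>;
    };
    template <class... T, class... U, class... Tail>
    struct concat_type_list<type_list<T...>, type_list<U...>, Tail...>
        : concat_type_list<type_list<T..., U...>, Tail...> {};

    template <template <class> class P, class List, bool V = true>
    struct filter_type_list;
    template <template <class> class P, class... T, bool V>
    struct filter_type_list<P, type_list<T...>, V> {
      // Each T maps to type_list<T> or type_list<>; concatenating the
      // results keeps exactly the matching types, in order.
      using type = concat_type_list_t<
          std::conditional_t<P<T>::value == V, type_list<T>, type_list<>>...>;
    };

    static_assert(
        std::is_same<
            filter_type_list<std::is_pointer,
                             type_list<int, char*, double, void*>>::type,
            type_list<char*, void*>>::value,
        "only the pointer types survive the filter");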
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
index 41607a2a8e7fedc56fd92fdfa9a472bda10bc547..ace826dd5a7f726cd9e0e2b3ce14b081a26680f2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
@@ -130,20 +130,20 @@ struct ObjectWithVLAEmulation {
   // CRTP boilerplate
 
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   Derived* _this() noexcept {
     return VLAEmulationAccess::_cast_to_derived(this);
   }
 
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   Derived const* _this() const noexcept {
     return VLAEmulationAccess::_cast_to_derived(this);
   }
 
   // Note: can't be constexpr because of reinterpret_cast
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   vla_value_type* _vla_pointer() noexcept {
     // The data starts right after the aligned storage of Derived
     return reinterpret_cast<vla_value_type*>(_this() + 1);
@@ -151,7 +151,7 @@ struct ObjectWithVLAEmulation {
 
   // Note: can't be constexpr because of reinterpret_cast
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   vla_value_type const* _vla_pointer() const noexcept {
     // The data starts right after the aligned storage of Derived
     return reinterpret_cast<vla_value_type const*>(_this() + 1);
@@ -159,7 +159,7 @@ struct ObjectWithVLAEmulation {
 
  public:
   KOKKOS_INLINE_FUNCTION
-  static /* KOKKOS_CONSTEXPR_14 */ size_t required_allocation_size(
+  static /* constexpr */ size_t required_allocation_size(
       vla_entry_count_type num_vla_entries) {
     KOKKOS_EXPECTS(num_vla_entries >= 0);
     return sizeof(Derived) + num_vla_entries * sizeof(VLAValueType);
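The size computation is plain layout arithmetic: the VLA payload is placed directly after the Derived object, so for a hypothetical 64-byte Derived with double-valued entries the expectation is simply (illustrative numbers, not from the patch):

    // required_allocation_size(n) == sizeof(Derived) + n * sizeof(double)
    //                             == 64 + 8 * n
    // e.g. n == 10  ->  64 + 80 == 144 bytes, with the doubles starting
    // at offset 64 (the address returned by _vla_pointer()).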
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
index b9e32a04e09afcf1a5fcbeba0bd81257631f7714..797b3f584b6234b290a809e7f1c502e04249f058 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -144,10 +144,10 @@ struct ViewCtorProp<typename std::enable_if<is_view_label<Label>::value>::type,
 };
 
 template <typename Space>
-struct ViewCtorProp<typename std::enable_if<
-                        Kokkos::Impl::is_memory_space<Space>::value ||
-                        Kokkos::Impl::is_execution_space<Space>::value>::type,
-                    Space> {
+struct ViewCtorProp<
+    typename std::enable_if<Kokkos::is_memory_space<Space>::value ||
+                            Kokkos::is_execution_space<Space>::value>::type,
+    Space> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
@@ -207,10 +207,10 @@ template <typename... P>
 struct ViewCtorProp : public ViewCtorProp<void, P>... {
  private:
   using var_memory_space =
-      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_memory_space, P...>;
+      Kokkos::Impl::has_condition<void, Kokkos::is_memory_space, P...>;
 
   using var_execution_space =
-      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_execution_space, P...>;
+      Kokkos::Impl::has_condition<void, Kokkos::is_execution_space, P...>;
 
   struct VOIDDUMMY {};
 
@@ -270,7 +270,6 @@ struct ViewCtorProp : public ViewCtorProp<void, P>... {
 
 namespace Kokkos {
 
-/* For backward compatibility */
 namespace Impl {
 struct ViewAllocateWithoutInitializingBackwardCompat {};
 
@@ -291,7 +290,6 @@ struct ViewCtorProp<WithoutInitializing_t, std::string,
 };
 } /* namespace Impl */
 
-/*[[deprecated(Use Kokkos::alloc(Kokkos::WithoutInitializing, label) instead]]*/
 using ViewAllocateWithoutInitializing =
     Impl::ViewCtorProp<Impl::WithoutInitializing_t, std::string,
                        Impl::ViewAllocateWithoutInitializingBackwardCompat>;
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index a380a306931f4150e95b6f433c8bb076b091c456..9523118748f09907933764aea6ca94a085e68a9d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -49,6 +49,7 @@
 #include <initializer_list>
 
 #include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_DetectionIdiom.hpp>
 #include <Kokkos_Pair.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_Extents.hpp>
@@ -862,7 +863,7 @@ struct ViewDataAnalysis {
 namespace Kokkos {
 namespace Impl {
 
-template <class Dimension, class Layout, typename Enable = void>
+template <class Dimension, class Layout, class Enable = void>
 struct ViewOffset {
   using is_mapping_plugin = std::false_type;
 };
@@ -1389,7 +1390,8 @@ struct ViewOffset<
     KOKKOS_INLINE_FUNCTION
     static constexpr size_t stride(size_t const N) {
       return ((align != 0) &&
-              ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) &&
+              ((static_cast<int>(Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD) *
+                static_cast<int>(align)) < N) &&
               ((N % div_ok) != 0))
                  ? N + align - (N % div_ok)
                  : N;
@@ -2022,7 +2024,8 @@ struct ViewOffset<
     KOKKOS_INLINE_FUNCTION
     static constexpr size_t stride(size_t const N) {
       return ((align != 0) &&
-              ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) &&
+              ((static_cast<int>(Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD) *
+                static_cast<int>(align)) < N) &&
               ((N % div_ok) != 0))
                  ? N + align - (N % div_ok)
                  : N;
@@ -2816,6 +2819,22 @@ struct ViewDataHandle<
 namespace Kokkos {
 namespace Impl {
 
+template <typename T>
+inline bool is_zero_byte(const T& t) {
+  using comparison_type = std::conditional_t<
+      sizeof(T) % sizeof(long long int) == 0, long long int,
+      std::conditional_t<
+          sizeof(T) % sizeof(long int) == 0, long int,
+          std::conditional_t<
+              sizeof(T) % sizeof(int) == 0, int,
+              std::conditional_t<sizeof(T) % sizeof(short int) == 0, short int,
+                                 char>>>>;
+  const auto* const ptr = reinterpret_cast<const comparison_type*>(&t);
+  for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i)
+    if (ptr[i] != 0) return false;
+  return true;
+}
+
 //----------------------------------------------------------------------------
 
 /*
@@ -2826,16 +2845,16 @@ namespace Impl {
  *  called from the shared memory tracking destruction.
  *  Secondarily to have two fewer partial specializations.
  */
-template <class ExecSpace, class ValueType,
+template <class DeviceType, class ValueType,
           bool IsScalar = std::is_scalar<ValueType>::value>
 struct ViewValueFunctor;
 
-template <class ExecSpace, class ValueType>
-struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
+template <class DeviceType, class ValueType>
+struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
+  using ExecSpace  = typename DeviceType::execution_space;
   using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
-  using Exec       = typename ExecSpace::execution_space;
 
-  Exec space;
+  ExecSpace space;
   ValueType* ptr;
   size_t n;
   bool destroy;
@@ -2864,11 +2883,50 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
         destroy(false),
         name(std::move(arg_name)) {}
 
-  void execute(bool arg) {
+  template <typename Dummy = ValueType>
+  std::enable_if_t<std::is_trivial<Dummy>::value &&
+                   std::is_trivially_copy_assignable<ValueType>::value>
+  construct_dispatch() {
+    ValueType value{};
+    if (Impl::is_zero_byte(value)) {
+      uint64_t kpID = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        // We are not really using parallel_for here but using beginParallelFor
+        // instead of begin_parallel_for (and adding "via memset") is the best
+        // we can do to indicate that this is not supposed to be tunable (and
+        // doesn't really execute a parallel_for).
+        Kokkos::Profiling::beginParallelFor(
+            "Kokkos::View::initialization [" + name + "] via memset",
+            Kokkos::Profiling::Experimental::device_id(space), &kpID);
+      }
+
+      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+          space,
+          Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
+          value);
+
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
+      }
+    } else {
+      parallel_for_implementation(false);
+    }
+  }
+
+  template <typename Dummy = ValueType>
+  std::enable_if_t<!(std::is_trivial<Dummy>::value &&
+                     std::is_trivially_copy_assignable<ValueType>::value)>
+  construct_dispatch() {
+    parallel_for_implementation(false);
+  }
+
+  void parallel_for_implementation(bool arg) {
     destroy = arg;
-    PolicyType policy(0, n);
-    std::string functor_name;
     if (!space.in_parallel()) {
+      PolicyType policy(0, n);
+      std::string functor_name;
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         functor_name =
@@ -2877,6 +2935,7 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
         Kokkos::Tools::Impl::begin_parallel_for(policy, *this, functor_name,
                                                 kpID);
       }
+
 #ifdef KOKKOS_ENABLE_CUDA
       if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
         Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n,
@@ -2886,7 +2945,7 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, policy);
       closure.execute();
-      space.fence();
+      space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Tools::Impl::end_parallel_for(policy, *this, functor_name,
                                               kpID);
@@ -2896,13 +2955,14 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
     }
   }
 
-  void construct_shared_allocation() { execute(false); }
+  void construct_shared_allocation() { construct_dispatch(); }
 
-  void destroy_shared_allocation() { execute(true); }
+  void destroy_shared_allocation() { parallel_for_implementation(true); }
 };
 
-template <class ExecSpace, class ValueType>
-struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
+template <class DeviceType, class ValueType>
+struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
+  using ExecSpace  = typename DeviceType::execution_space;
   using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
 
   ExecSpace space;
@@ -2921,12 +2981,54 @@ struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
                    size_t const arg_n, std::string arg_name)
       : space(arg_space), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)) {}
 
-  void construct_shared_allocation() {
-    if (!space.in_parallel()) {
+  template <typename Dummy = ValueType>
+  std::enable_if_t<std::is_trivial<Dummy>::value &&
+                   std::is_trivially_copy_assignable<Dummy>::value>
+  construct_shared_allocation() {
+    // Shortcut for zero initialization
+    ValueType value{};
+    if (Impl::is_zero_byte(value)) {
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
+        // We are not really using parallel_for here but using beginParallelFor
+        // instead of begin_parallel_for (and adding "via memset") is the best
+        // we can do to indicate that this is not supposed to be tunable (and
+        // doesn't really execute a parallel_for).
         Kokkos::Profiling::beginParallelFor(
-            "Kokkos::View::initialization [" + name + "]", 0, &kpID);
+            "Kokkos::View::initialization [" + name + "] via memset",
+            Kokkos::Profiling::Experimental::device_id(space), &kpID);
+      }
+
+      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+          space,
+          Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
+          value);
+
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
+      }
+    } else {
+      parallel_for_implementation();
+    }
+  }
+
+  template <typename Dummy = ValueType>
+  std::enable_if_t<!(std::is_trivial<Dummy>::value &&
+                     std::is_trivially_copy_assignable<Dummy>::value)>
+  construct_shared_allocation() {
+    parallel_for_implementation();
+  }
+
+  void parallel_for_implementation() {
+    if (!space.in_parallel()) {
+      PolicyType policy(0, n);
+      std::string functor_name = "Kokkos::View::initialization [" + name + "]";
+      uint64_t kpID            = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Tools::Impl::begin_parallel_for(policy, *this, functor_name,
+                                                kpID);
       }
 #ifdef KOKKOS_ENABLE_CUDA
       if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
@@ -2937,9 +3039,11 @@ struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, PolicyType(0, n));
       closure.execute();
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ViewValueFunctor: Fence after setting values in view");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
-        Kokkos::Profiling::endParallelFor(kpID);
+        Kokkos::Tools::Impl::end_parallel_for(policy, *this, functor_name,
+                                              kpID);
       }
     } else {
       for (size_t i = 0; i < n; ++i) operator()(i);
@@ -3232,7 +3336,9 @@ class ViewMapping<
     using execution_space = typename alloc_prop::execution_space;
     using memory_space    = typename Traits::memory_space;
     using value_type      = typename Traits::value_type;
-    using functor_type    = ViewValueFunctor<execution_space, value_type>;
+    using functor_type =
+        ViewValueFunctor<Kokkos::Device<execution_space, memory_space>,
+                         value_type>;
     using record_type =
         Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>;
 
@@ -3314,17 +3420,10 @@ class ViewMapping<
                            Kokkos::LayoutStride>::value))))>::type> {
  private:
   enum {
-    is_assignable_space =
-#if 1
-        Kokkos::Impl::MemorySpaceAccess<
-            typename DstTraits::memory_space,
-            typename SrcTraits::memory_space>::assignable
-  };
-#else
-        std::is_same<typename DstTraits::memory_space,
-                     typename SrcTraits::memory_space>::value
+    is_assignable_space = Kokkos::Impl::MemorySpaceAccess<
+        typename DstTraits::memory_space,
+        typename SrcTraits::memory_space>::assignable
   };
-#endif
 
   enum {
     is_assignable_value_type =
@@ -3728,7 +3827,7 @@ class ViewMapping<
 
   template <class MemoryTraits>
   struct apply {
-    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
+    static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, "");
 
     using traits_type =
         Kokkos::ViewTraits<data_type, array_layout,
@@ -3842,24 +3941,21 @@ struct OperatorBoundsErrorOnDevice<MapType, true> {
    this defined by default.
    The existence of this alias indicates the existence of MapType::is_managed
  */
-template <class T, class Enable = void>
-struct has_printable_label_typedef : public std::false_type {};
-
 template <class T>
-struct has_printable_label_typedef<T,
-                                   void_t<typename T::printable_label_typedef>>
-    : public std::true_type {};
+using printable_label_typedef_t = typename T::printable_label_typedef;
 
-template <class MapType>
-KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const&,
-                                                            std::false_type) {
+template <class Map>
+KOKKOS_FUNCTION
+    std::enable_if_t<!is_detected<printable_label_typedef_t, Map>::value>
+    operator_bounds_error_on_device(Map const&) {
   Kokkos::abort("View bounds error");
 }
 
-template <class MapType>
-KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const& map,
-                                                            std::true_type) {
-  OperatorBoundsErrorOnDevice<MapType>::run(map);
+template <class Map>
+KOKKOS_FUNCTION
+    std::enable_if_t<is_detected<printable_label_typedef_t, Map>::value>
+    operator_bounds_error_on_device(Map const& map) {
+  OperatorBoundsErrorOnDevice<Map>::run(map);
 }
 
 #endif  // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
@@ -3885,8 +3981,7 @@ KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds(
        This check should cover the case of Views that don't
        have the Unmanaged trait but were initialized by pointer. */
     if (tracker.m_tracker.has_record()) {
-      operator_bounds_error_on_device<MapType>(
-          map, has_printable_label_typedef<MapType>());
+      operator_bounds_error_on_device(map);
     } else {
       Kokkos::abort("View bounds error");
     }
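Two ideas drive the new initialization fast path above: a value-initialized trivial type can be inspected byte-by-byte, and an all-zero bit pattern can be produced by a memset-style fill instead of running constructors in a parallel_for. A simplified, memcpy-based variant of the same byte check (the Filled struct is illustrative):

    #include <cstring>

    template <typename T>
    bool is_zero_byte_sketch(const T& t) {
      // memcpy of the object representation is well-defined even for
      // types with padding.
      unsigned char bytes[sizeof(T)];
      std::memcpy(bytes, &t, sizeof(T));
      for (std::size_t i = 0; i < sizeof(T); ++i)
        if (bytes[i] != 0) return false;
      return true;
    }

    struct Filled {
      int i = 42;  // default value is deliberately not all-zero
    };

    int main() {
      double d{};  // value-initialized: all zero bytes -> memset path
      Filled f{};  // nonzero bytes -> fall back to the parallel_for path
      return (is_zero_byte_sketch(d) && !is_zero_byte_sketch(f)) ? 0 : 1;
    }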
diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
index a5f5406746befc984f17f815e04bac63f0fadff4..d964baa8fb0f5e1b105d244740b74e32d1bdd69e 100644
--- a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
+++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
@@ -62,10 +62,10 @@ void sink(Args&&... args) {
     Kokkos::ImplSYCL::sink(__VA_ARGS__);   \
   } while (0)
 #else
-#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...)                       \
-  do {                                                                   \
-    static const __attribute__((opencl_constant)) char fmt[] = (format); \
-    sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__);              \
+#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...)                \
+  do {                                                            \
+    const __attribute__((opencl_constant)) char fmt[] = (format); \
+    sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__);       \
   } while (0)
 #endif
 #endif
diff --git a/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
index 4467b2e03c486d07d80c3fee66e6c3b50c42256e..e12d1f6a49d37f4f595214be00713c2ccb4166ef 100644
--- a/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
@@ -56,6 +56,11 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_execution_space_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_execution_space_erroneously_given_to_execution_policy<void> {
+};
 struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> {
   struct base_traits {
     static constexpr auto execution_space_is_defaulted = true;
@@ -63,32 +68,30 @@ struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> {
     using execution_space = Kokkos::DefaultExecutionSpace;
   };
   template <class T>
-  using trait_matches_specification = is_execution_space<T>;
-};
-
-// </editor-fold> end trait specification }}}1
-//==============================================================================
+  using trait_matches_specification = Kokkos::is_execution_space<T>;
+  template <class ExecSpace, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+    static constexpr auto show_execution_space_error_in_compilation_message =
+        show_extra_execution_space_erroneously_given_to_execution_policy<
+            std::conditional_t<base_t::execution_space_is_defaulted, void,
+                               typename base_t::execution_space>>{};
+    static_assert(base_t::execution_space_is_defaulted,
+                  "Kokkos Error: More than one execution space given. Search "
+                  "compiler output for 'show_extra_execution_space' to see the "
+                  "type of the errant tag.");
 
-template <class ExecutionSpace, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>,
-    ExecutionSpace, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
+    static constexpr auto execution_space_is_defaulted = false;
 
-  static_assert(base_t::execution_space_is_defaulted,
-                "Kokkos Error: More than one execution space given");
-
-  static constexpr bool execution_space_is_defaulted = false;
-
-  using execution_space = ExecutionSpace;
+    using execution_space = ExecSpace;
+  };
 };
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end trait specification }}}1
 //==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
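The show_extra_execution_space_... device is worth spelling out: a static_assert string cannot name a type, so the code additionally instantiates an undefined template with the offending type, making the compiler print that type verbatim in its "incomplete type" error. A reduced sketch of the trick (all names here are illustrative):

    #include <type_traits>

    template <class T>
    struct show_me;           // undefined primary template
    template <>
    struct show_me<void> {};  // only the "no problem" case is complete

    template <bool Ok, class Offender>
    struct check_unique_trait {
      // If Ok is false this instantiates show_me<Offender>, and the
      // resulting "incomplete type 'show_me<Offender>'" diagnostic spells
      // out Offender's full name, searchable via 'show_me'.
      static constexpr auto diagnostic =
          show_me<std::conditional_t<Ok, void, Offender>>{};
      static_assert(Ok, "duplicate trait; search output for 'show_me'");
    };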
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
index eb649dc0887a2aab8c88feae8156676b70a7cdf7..b57dfbbc07ccc0e2391b2fdb5b6ec577ed552cc2 100644
--- a/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
@@ -61,6 +61,12 @@ struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> {
   struct base_traits {
     using is_graph_kernel = std::false_type;
   };
+  template <class, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using is_graph_kernel = std::true_type;
+  };
   template <class T>
   using trait_matches_specification = std::is_same<T, IsGraphKernelTag>;
 };
@@ -68,19 +74,6 @@ struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> {
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Impl::IsGraphKernelTag, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  using is_graph_kernel = std::true_type;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
index e15adc17116cb66481f90acc0b9ba5a83ec1ab52..63446375fbd529e82d71acf2bac5ef12fba238af 100644
--- a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
@@ -46,54 +46,71 @@
 #define KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP
 
 #include <Kokkos_Macros.hpp>
-#include <Kokkos_Concepts.hpp>  // IndexType, is_index_type
+#include <Kokkos_Concepts.hpp>  // IndexType
 #include <traits/Kokkos_PolicyTraitAdaptor.hpp>
 #include <traits/Kokkos_Traits_fwd.hpp>
 
 namespace Kokkos {
 namespace Impl {
 
+template <class Trait, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin;
+
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_index_type_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_index_type_erroneously_given_to_execution_policy<void> {};
 struct IndexTypeTrait : TraitSpecificationBase<IndexTypeTrait> {
   struct base_traits {
     static constexpr bool index_type_is_defaulted = true;
     using index_type = dependent_policy_trait_default;
   };
-  template <class T>
-  using trait_matches_specification =
-      std::integral_constant<bool, std::is_integral<T>::value ||
-                                       is_index_type<T>::value>;
+  template <class IdxType, class AnalyzeNextTrait>
+  using mixin_matching_trait = IndexTypePolicyMixin<IdxType, AnalyzeNextTrait>;
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="IndexTypePolicyMixin specializations"> {{{1
 
 // Index type given as IndexType template
-template <class IntegralIndexType, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::IndexType<IntegralIndexType>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+template <class IntegralIndexType, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin<Kokkos::IndexType<IntegralIndexType>,
+                            AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
+  static constexpr auto show_index_type_error_in_compilation_message =
+      show_extra_index_type_erroneously_given_to_execution_policy<
+          std::conditional_t<base_t::index_type_is_defaulted, void,
+                             typename base_t::index_type>>{};
   static_assert(base_t::index_type_is_defaulted,
-                "Kokkos Error: More than one index type given");
+                "Kokkos Error: More than one index type given. Search "
+                "compiler output for 'show_extra_index_type' to see the "
+                "type of the errant tag.");
   static constexpr bool index_type_is_defaulted = false;
   using index_type = Kokkos::IndexType<IntegralIndexType>;
 };
 
-// IndexType given as an integral type directly
-template <class IntegralIndexType, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<std::is_integral<IntegralIndexType>::value>,
-    IntegralIndexType, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+// IndexType given as an integral type directly (the matcher already checks
+// this, so we don't have to specialize to re-check it here)
+template <class IntegralIndexType, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
+  static constexpr auto show_index_type_error_in_compilation_message =
+      show_extra_index_type_erroneously_given_to_execution_policy<
+          std::conditional_t<base_t::index_type_is_defaulted, void,
+                             typename base_t::index_type>>{};
   static_assert(base_t::index_type_is_defaulted,
-                "Kokkos Error: More than one index type given");
+                "Kokkos Error: More than one index type given. Search "
+                "compiler output for 'show_extra_index_type' to see the "
+                "type of the errant tag.");
+  static_assert(std::is_integral<IntegralIndexType>::value, "");
   static constexpr bool index_type_is_defaulted = false;
   using index_type = Kokkos::IndexType<IntegralIndexType>;
 };
@@ -101,6 +118,22 @@ struct AnalyzeExecPolicy<
 // </editor-fold> end AnalyzeExecPolicy specializations }}}1
 //==============================================================================
 
+//==============================================================================
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
+
+template <class IntegralIndexType>
+struct PolicyTraitMatcher<IndexTypeTrait, IndexType<IntegralIndexType>>
+    : std::true_type {};
+
+template <class IntegralIndexType>
+struct PolicyTraitMatcher<
+    IndexTypeTrait, IntegralIndexType,
+    std::enable_if_t<std::is_integral<IntegralIndexType>::value>>
+    : std::true_type {};
+
+// </editor-fold> end PolicyTraitMatcher specialization"> }}}1
+//==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
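With both specializations in place, the matcher recognizes either spelling of an index type; stated as compile-time checks (a sketch, assuming the headers above are included):

    static_assert(
        Kokkos::Impl::PolicyTraitMatcher<Kokkos::Impl::IndexTypeTrait,
                                         Kokkos::IndexType<int>>::value,
        "IndexType<int> matches the first specialization");
    static_assert(Kokkos::Impl::PolicyTraitMatcher<Kokkos::Impl::IndexTypeTrait,
                                                   long>::value,
                  "bare integral types match the enable_if specialization");
    static_assert(!Kokkos::Impl::PolicyTraitMatcher<Kokkos::Impl::IndexTypeTrait,
                                                    float>::value,
                  "anything else falls through to the false_type default");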
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
index 30e07039a405d61f2c78217284f9036a0a533f06..b05f3b29e976c503c120c4f59dc4ed81b01822f7 100644
--- a/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
@@ -45,8 +45,11 @@
 #ifndef KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
 #define KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
 
-#include <Kokkos_Concepts.hpp>  // is_iteration_pattern
-#include <type_traits>          // is_void
+#include <Kokkos_Concepts.hpp>                   // is_iteration_pattern
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>  // TraitSpecificationBase
+#include <Kokkos_Rank.hpp>                       // Rank
+#include <Kokkos_Layout.hpp>                     // Iterate
+#include <type_traits>                           // is_void
 
 namespace Kokkos {
 namespace Impl {
@@ -54,32 +57,42 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_iteration_pattern_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_iteration_pattern_erroneously_given_to_execution_policy<
+    void> {};
 struct IterationPatternTrait : TraitSpecificationBase<IterationPatternTrait> {
   struct base_traits {
     using iteration_pattern = void;  // TODO set default iteration pattern
   };
-  template <class T>
-  using trait_matches_specification = is_iteration_pattern<T>;
+  template <class IterPattern, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    static constexpr auto show_iteration_pattern_error_in_compilation_message =
+        show_extra_iteration_pattern_erroneously_given_to_execution_policy<
+            typename base_t::iteration_pattern>{};
+    static_assert(
+        std::is_void<typename base_t::iteration_pattern>::value,
+        "Kokkos Error: More than one index type given. Search "
+        "compiler output for 'show_extra_iteration_pattern' to see the "
+        "type of the errant tag.");
+    using iteration_pattern = IterPattern;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
 
-template <class IterationPattern, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<is_iteration_pattern<IterationPattern>::value>,
-    IterationPattern, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(std::is_void<typename base_t::iteration_pattern>::value,
-                "Kokkos Error: More than one iteration pattern given");
-  using iteration_pattern = IterationPattern;
-};
+template <unsigned N, Iterate OuterDir, Iterate InnerDir>
+struct PolicyTraitMatcher<IterationPatternTrait, Rank<N, OuterDir, InnerDir>>
+    : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
 
 }  // end namespace Impl
diff --git a/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
index 73ae8e27e2eca54412b4cbab464b1760c93d7aed..06836bef8bff6ffc19d470766e0caa0a739a43c2 100644
--- a/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
@@ -62,29 +62,33 @@ struct LaunchBoundsTrait : TraitSpecificationBase<LaunchBoundsTrait> {
 
     using launch_bounds = LaunchBounds<>;
   };
-  template <class T>
-  using trait_matches_specification = is_launch_bounds<T>;
+  template <class LaunchBoundParam, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+
+    static constexpr bool launch_bounds_is_defaulted = false;
+
+    static_assert(base_t::launch_bounds_is_defaulted,
+                  "Kokkos Error: More than one launch_bounds given");
+
+    using launch_bounds = LaunchBoundParam;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
 
-template <unsigned int MaxT, unsigned int MinB, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::LaunchBounds<MaxT, MinB>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(base_t::launch_bounds_is_defaulted,
-                "Kokkos Error: More than one launch_bounds given");
-  static constexpr bool launch_bounds_is_defaulted = false;
-  using launch_bounds = Kokkos::LaunchBounds<MaxT, MinB>;
-};
+template <unsigned int maxT, unsigned int minB>
+struct PolicyTraitMatcher<LaunchBoundsTrait, LaunchBounds<maxT, minB>>
+    : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
index 3deb4a94d54ddeee0a6a0712f107d61674818668..73be14cf8501b3c3bff4a2386e2f75e1ffb00f19 100644
--- a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
@@ -82,6 +82,9 @@ struct MaximizeOccupancy {
 
 namespace Impl {
 
+template <class Policy, class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin;
+
 //==============================================================================
 // <editor-fold desc="Occupancy control trait specification"> {{{1
 
@@ -94,6 +97,9 @@ struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> {
       return occupancy_control{};
     }
   };
+  template <class OccControl, class AnalyzeNextTrait>
+  using mixin_matching_trait =
+      OccupancyControlPolicyMixin<OccControl, AnalyzeNextTrait>;
   template <class T>
   using trait_matches_specification = std::integral_constant<
       bool,
@@ -105,39 +111,33 @@ struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> {
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-// The DesiredOccupancy case has runtime storage, so we need to handle copies
-// and assignments
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy,
-                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
- public:
-  using base_t            = AnalyzeExecPolicy<void, Traits...>;
+// <editor-fold desc="OccupancyControlPolicyMixin specializations"> {{{1
+
+template <class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin<Kokkos::Experimental::DesiredOccupancy,
+                                   AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t            = AnalyzeNextTrait;
   using occupancy_control = Kokkos::Experimental::DesiredOccupancy;
   static constexpr bool experimental_contains_desired_occupancy = true;
 
-  template <class OccControl>
-  using with_occupancy_control = AnalyzeExecPolicy<void, OccControl, Traits...>;
-
   // Treat this as private, but make it public so that MSVC will still treat
   // this as a standard layout class and make it the right size: storage for a
   // stateful desired occupancy
   //   private:
-  occupancy_control m_desired_occupancy;
+  occupancy_control m_desired_occupancy = occupancy_control{};
 
-  AnalyzeExecPolicy() = default;
+  OccupancyControlPolicyMixin() = default;
   // Converting constructor
   // Just rely on the convertibility of occupancy_control to transfer the data
   template <class Other>
-  AnalyzeExecPolicy(ExecPolicyTraitsWithDefaults<Other> const& other)
+  OccupancyControlPolicyMixin(ExecPolicyTraitsWithDefaults<Other> const& other)
       : base_t(other),
         m_desired_occupancy(other.impl_get_occupancy_control()) {}
 
   // Converting assignment operator
   // Just rely on the convertibility of occupancy_control to transfer the data
   template <class Other>
-  AnalyzeExecPolicy& operator=(
+  OccupancyControlPolicyMixin& operator=(
       ExecPolicyTraitsWithDefaults<Other> const& other) {
     *static_cast<base_t*>(this) = other;
     this->impl_set_desired_occupancy(
@@ -160,16 +160,16 @@ struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy,
   }
 };
 
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Experimental::MaximizeOccupancy,
-                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+template <class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin<Kokkos::Experimental::MaximizeOccupancy,
+                                   AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
   using occupancy_control = Kokkos::Experimental::MaximizeOccupancy;
   static constexpr bool experimental_contains_desired_occupancy = false;
 };
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end OccupancyControlPolicyMixin specializations }}}1
 //==============================================================================
 
 }  // end namespace Impl
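Because DesiredOccupancy carries runtime state, the mixin keeps the stored percentage alive across policy conversions; that is what a user-side occupancy hint depends on. A usage sketch (the bounds and the 33 percent figure are arbitrary):

    #include <Kokkos_Core.hpp>

    void run(int n) {
      // prefer() rebuilds the policy with DesiredOccupancy attached; the
      // converting constructor/assignment in the mixin above carries the
      // stored percentage through that rebuild.
      auto policy = Kokkos::Experimental::prefer(
          Kokkos::RangePolicy<>(0, n),
          Kokkos::Experimental::DesiredOccupancy{33});
      Kokkos::parallel_for("occupancy_sketch", policy, KOKKOS_LAMBDA(int) {});
    }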
diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
index b087dac85559bd6dc67c983bdaad1a6675cfde9b..e500dd4e831abaa03479e9fb1e2fb67595107a9e 100644
--- a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
@@ -73,7 +73,7 @@ namespace Impl {
 // something that we can default to in the unspecialized case, just like we
 // do for AnalyzeExecPolicy
 template <class TraitSpec, class Trait, class Enable = void>
-struct PolicyTraitMatcher;
+struct PolicyTraitMatcher : std::false_type {};
 
 template <class TraitSpec, class Trait>
 struct PolicyTraitMatcher<
diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..31927320bf6fe77cc133e49a80e7f741574165a8
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp
@@ -0,0 +1,77 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
+#define KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
+
+#include <impl/Kokkos_Utilities.hpp>  // type_list
+
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="PolicyTraitMatcher"> {{{1
+
+// To handle the WorkTag case, we need more than just a predicate; we need
+// something that we can default to in the unspecialized case, just like we
+// do for AnalyzeExecPolicy
+template <class TraitSpec, class Trait, class Enable = void>
+struct PolicyTraitMatcher : std::false_type {};
+
+template <class TraitSpec, class Trait>
+struct PolicyTraitMatcher<
+    TraitSpec, Trait,
+    std::enable_if_t<
+        TraitSpec::template trait_matches_specification<Trait>::value>>
+    : std::true_type {};
+
+// </editor-fold> end PolicyTraitMatcher }}}1
+//==============================================================================
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
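The significant change relative to the bare forward declaration is the ": std::false_type" default, which lets generic code query the matcher against any candidate type without tripping over an incomplete type. A toy trait specification makes both outcomes visible (ToySpec and MyTag are illustrative, not Kokkos names):

    #include <type_traits>
    // assumes traits/Kokkos_PolicyTraitMatcher.hpp is included

    struct ToySpec {
      template <class T>
      using trait_matches_specification = std::is_enum<T>;
    };
    enum class MyTag {};

    // The enable_if specialization fires:
    static_assert(Kokkos::Impl::PolicyTraitMatcher<ToySpec, MyTag>::value, "");
    // The primary template now yields false instead of a hard error:
    static_assert(!Kokkos::Impl::PolicyTraitMatcher<ToySpec, int>::value, "");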
diff --git a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
index 74bab6fce2a632269a804971af3e50348e34c8b2..3e578f9060ab22b8707adc8797d401226a52ff44 100644
--- a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
@@ -57,34 +57,43 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_schedule_type_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_schedule_type_erroneously_given_to_execution_policy<void> {};
 struct ScheduleTrait : TraitSpecificationBase<ScheduleTrait> {
   struct base_traits {
     static constexpr auto schedule_type_is_defaulted = true;
 
     using schedule_type = Schedule<Static>;
   };
-  template <class T>
-  using trait_matches_specification = is_schedule_type<T>;
+  template <class Sched, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using schedule_type = Sched;
+    static constexpr auto show_schedule_type_error_in_compilation_message =
+        show_extra_schedule_type_erroneously_given_to_execution_policy<
+            std::conditional_t<base_t::schedule_type_is_defaulted, void,
+                               typename base_t::schedule_type>>{};
+    static_assert(base_t::schedule_type_is_defaulted,
+                  "Kokkos Error: More than one schedule type given. Search "
+                  "compiler output for 'show_extra_schedule_type' to see the "
+                  "type of the errant tag.");
+    static constexpr bool schedule_type_is_defaulted = false;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class ScheduleType, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Schedule<ScheduleType>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(base_t::schedule_type_is_defaulted,
-                "Kokkos Error: More than one schedule type given");
-  static constexpr bool schedule_type_is_defaulted = false;
-  using schedule_type = Kokkos::Schedule<ScheduleType>;
-};
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
+
+template <class Sched>
+struct PolicyTraitMatcher<ScheduleTrait, Schedule<Sched>> : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
 
 }  // end namespace Impl
diff --git a/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
index b8b9a0ca2d889b08116528803d0c1b096060ecad..b8289ca6188846884277ca514db453145f1cb3c6 100644
--- a/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
@@ -51,9 +51,15 @@ namespace Impl {
 template <class Enable, class... TraitsList>
 struct AnalyzeExecPolicy;
 
+template <class Enable, class TraitSpecList, class... Traits>
+struct AnalyzeExecPolicyUseMatcher;
+
 template <class AnalysisResults>
 struct ExecPolicyTraitsWithDefaults;
 
+template <class TraitSpec, class Trait, class Enable>
+struct PolicyTraitMatcher;
+
 template <class TraitSpec, template <class...> class PolicyTemplate,
           class AlreadyProcessedList, class ToProcessList, class NewTrait,
           class Enable = void>
@@ -67,6 +73,40 @@ struct PolicyTraitAdaptor;
 // traits
 struct dependent_policy_trait_default;
 
+//==============================================================================
+// <editor-fold desc="Execution policy trait specifications"> {{{1
+
+struct ExecutionSpaceTrait;
+struct IndexTypeTrait;
+struct ScheduleTrait;
+struct IterationPatternTrait;
+struct WorkItemPropertyTrait;
+struct LaunchBoundsTrait;
+struct OccupancyControlTrait;
+struct GraphKernelTrait;
+struct WorkTagTrait;
+
+// Keep these sorted by frequency of use to reduce compilation time
+//
+// clang-format off
+using execution_policy_trait_specifications =
+  type_list<
+    ExecutionSpaceTrait,
+    IndexTypeTrait,
+    ScheduleTrait,
+    IterationPatternTrait,
+    WorkItemPropertyTrait,
+    LaunchBoundsTrait,
+    OccupancyControlTrait,
+    GraphKernelTrait,
+    // This one has to be last, unfortunately:
+    WorkTagTrait
+  >;
+// clang-format on
+
+// </editor-fold> end Execution policy trait specifications }}}1
+//==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
index 2656316fb934333655d0370f4dc3d40eea7bbb86..35671d19b02bb72c777b77717beced94d152beb3 100644
--- a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
@@ -60,6 +60,12 @@ struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> {
   struct base_traits {
     using work_item_property = Kokkos::Experimental::WorkItemProperty::None_t;
   };
+  template <class WorkItemProp, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using work_item_property = WorkItemProp;
+  };
   template <class T>
   using trait_matches_specification =
       Kokkos::Experimental::is_work_item_property<T>;
@@ -68,26 +74,6 @@ struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> {
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class Property, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<
-        Kokkos::Experimental::is_work_item_property<Property>::value>,
-    Property, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(
-      std::is_same<typename base_t::work_item_property,
-                   Kokkos::Experimental::WorkItemProperty::None_t>::value,
-      "Kokkos Error: More than one work item property given");
-  using work_item_property = Property;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
-
 }  // end namespace Impl
 
 namespace Experimental {
diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
index 877005756a703b067c07c6f57c3fc4212f7484ca..424e5c405b70cff9f73ef5756b5dca41e9d3d618 100644
--- a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
@@ -49,6 +49,7 @@
 #include <Kokkos_Concepts.hpp>  // is_execution_space
 #include <traits/Kokkos_PolicyTraitAdaptor.hpp>
 #include <traits/Kokkos_Traits_fwd.hpp>
+#include <impl/Kokkos_Utilities.hpp>  // type_list_any, type_list_remove_first
 
 namespace Kokkos {
 namespace Impl {
@@ -56,68 +57,65 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_work_tag_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_work_tag_erroneously_given_to_execution_policy<void> {};
+
+using _exec_policy_traits_without_work_tag = typename type_list_remove_first<
+    WorkTagTrait, execution_policy_trait_specifications>::type;
+
+template <class Trait>
+struct _trait_matches_spec_predicate {
+  template <class TraitSpec>
+  struct apply {
+    using type = typename PolicyTraitMatcher<TraitSpec, Trait>::type;
+    static constexpr bool value = type::value;
+  };
+};
+
 struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> {
   struct base_traits {
     using work_tag = void;
   };
+  template <class WorkTag, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using work_tag = WorkTag;
+    static constexpr auto show_work_tag_error_in_compilation_message =
+        show_extra_work_tag_erroneously_given_to_execution_policy<
+            typename base_t::work_tag>{};
+    static_assert(
+        std::is_void<typename base_t::work_tag>::value,
+        "Kokkos Error: More than one work tag given. Search compiler output "
+        "for 'show_extra_work_tag' to see the type of the errant tag.");
+  };
+  // Since we don't have subsumption in pre-C++20, we need to have the work tag
+  // "trait" handling code ensure that none of the other conditions are met.
+  // * Compile-time cost note: at first glance it looks like this
+  //   "rechecks" all of the other trait specs when used in the context of the
+  //   full list of execution policy traits, but actually since we've already
+  //   checked all of them to get to the end of the list, the compiler will
+  //   have already generated those definitions, so there should be little extra
+  //   cost to this. However, in the scenario where we use work tag in isolation
+  //   (like if we were to add a `require()`-like thing that changes the work
+  //   tag of an existing execution policy instance), we need to check all of
+  //   the other traits to make sure that we're not replacing something else,
+  //   given that the concept of a work tag is basically unconstrained and could
+  //   be anything.  This should still be as efficient at compile time as the
+  //   old code that just did a big long series of nested std::conditionals, but
+  //   we should benchmark this assumption if it becomes a problem.
+  template <class T>
+  using trait_matches_specification = std::integral_constant<
+      bool, !std::is_void<T>::value &&
+                !type_list_any<_trait_matches_spec_predicate<T>::template apply,
+                               _exec_policy_traits_without_work_tag>::value>;
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-// Since we don't have subsumption in pre-C++20, we need to have the work tag
-// "trait" handling code be unspecialized, so we handle it instead in a class
-// with a different name.
-template <class... Traits>
-struct AnalyzeExecPolicyHandleWorkTag : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-};
-
-template <class WorkTag, class... Traits>
-struct AnalyzeExecPolicyHandleWorkTag<WorkTag, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(std::is_void<typename base_t::work_tag>::value,
-                "Kokkos Error: More than one work tag given");
-  using work_tag = WorkTag;
-};
-
-// This only works if this is not a partial specialization, so we have to
-// do the partial specialization elsewhere
-template <class Enable, class... Traits>
-struct AnalyzeExecPolicy : AnalyzeExecPolicyHandleWorkTag<Traits...> {
-  using base_t = AnalyzeExecPolicyHandleWorkTag<Traits...>;
-  using base_t::base_t;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
-
-//==============================================================================
-// <editor-fold desc="PolicyTraitMatcher specializations"> {{{1
-
-// In order to match the work tag trait the work tag "matcher" needs to be
-// unspecialized and the logic needs to be handled in a differently-named class,
-// just like above.
-template <class TraitSpec, class Trait>
-struct PolicyTraitMatcherHandleWorkTag : std::false_type {};
-
-template <class Trait>
-struct PolicyTraitMatcherHandleWorkTag<WorkTagTrait, Trait>
-    : std::integral_constant<bool, !std::is_void<Trait>::value> {};
-
-template <class TraitSpec, class Trait, class Enable>
-struct PolicyTraitMatcher /* unspecialized! */
-    : PolicyTraitMatcherHandleWorkTag<TraitSpec, Trait> {};
-
-// </editor-fold> end PolicyTraitMatcher specializations }}}1
-//==============================================================================
-
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt
index 5826208851090933ee296988287a6a633eb2c476..89b8ff1e4f0a8004ecd4c2f06d72544123107d03 100644
--- a/packages/kokkos/core/unit_test/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/CMakeLists.txt
@@ -41,10 +41,10 @@ SET(KOKKOS_OPENMP_FEATURE_LEVEL 999)
 SET(KOKKOS_OPENMP_NAME OpenMP)
 
 # FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend.
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
-  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 8)
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 10)
 ELSE()
-  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 13)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 14)
 ENDIF()
 
 SET(KOKKOS_OPENMPTARGET_NAME Experimental::OpenMPTarget)
@@ -65,6 +65,21 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
+SET(COMPILE_ONLY_SOURCES
+  TestDetectionIdiom.cpp
+  TestInterOp.cpp
+  TestTypeList.cpp
+)
+# TestInterOp has a dependency on containers
+IF(KOKKOS_HAS_TRILINOS)
+  LIST(REMOVE_ITEM COMPILE_ONLY_SOURCES TestInterOp.cpp)
+ENDIF()
+KOKKOS_ADD_EXECUTABLE(
+  TestCompileOnly
+  SOURCES
+  ${COMPILE_ONLY_SOURCES}
+)
+
 foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   # Because there is always an exception to the rule
   if(Tag STREQUAL "Threads")
@@ -98,6 +113,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         Complex
         Crs
         DeepCopyAlignment
+        ExecutionSpace
         FunctorAnalysis
         Init
         LocalDeepCopy
@@ -107,6 +123,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         MDRange_c
         HostSharedPtr
         HostSharedPtrAccessOnDevice
+        QuadPrecisionMath
+        ExecSpacePartitioning
+        MathematicalSpecialFunctions
         )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
@@ -190,7 +209,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
     elseif(Tag STREQUAL "HIP")
       set(TagHostAccessible HIPHostPinned)
     elseif(Tag STREQUAL "SYCL")
-      set(TagHostAccessible SYCLSharedUSMSpace)
+      set(TagHostAccessible SYCLSharedUSM)
     endif()
 
     set(${Tag}_SOURCES2B)
@@ -257,6 +276,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   endif()
 endforeach()
 
+foreach(PairDeviceSpace HIP-HostPinned;Cuda-HostPinned;Cuda-UVM;SYCL-HostUSM;SYCL-SharedUSM)
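+  # Generate Test<Device><Space>_<Name>.cpp stubs for these host-accessible
+  # device memory spaces; they are collected into <Device>_SOURCES3 and
+  # folded into <Device>_SOURCES below.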
+  string(REGEX REPLACE "([^-]*)-(.*)" "\\1" DEVICE ${PairDeviceSpace})
+  string(REGEX REPLACE "([^-]*)-(.*)" "\\2" SPACE ${PairDeviceSpace})
+
+  string(TOUPPER ${DEVICE} UPPER_DEVICE)
+  string(TOLOWER ${DEVICE} dir)
+
+  if(Kokkos_ENABLE_${UPPER_DEVICE})
+    set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir})
+    file(MAKE_DIRECTORY ${dir})
+    foreach(Name
+      SharedAlloc
+      ViewAPI_a
+      ViewAPI_b
+      ViewAPI_c
+      ViewAPI_d
+      ViewAPI_e
+      ViewCopy_a
+      ViewCopy_b
+      ViewMapping_a
+      ViewMapping_b
+      ViewMapping_subview
+      )
+      set(file ${dir}/Test${DEVICE}${SPACE}_${Name}.cpp)
+      # Write to a temporary intermediate file and call configure_file to avoid
+      # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
+      file(WRITE ${dir}/dummy.cpp
+          "#include <Test${DEVICE}${SPACE}_Category.hpp>\n"
+          "#include <Test${Name}.hpp>\n"
+      )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ${DEVICE}_SOURCES3 ${file})
+    endforeach()
+    list(APPEND ${DEVICE}_SOURCES ${${DEVICE}_SOURCES3})
+  endif()
+endforeach()
+
 if(Kokkos_ENABLE_OPENMPTARGET)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexfloat.cpp
@@ -264,9 +320,7 @@ if(Kokkos_ENABLE_OPENMPTARGET)
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Crs.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewCopy_a.cpp
@@ -278,9 +332,16 @@ if(Kokkos_ENABLE_OPENMPTARGET)
 endif()
 
 # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
-IF(KOKKOS_ENABLE_OPENMPTARGET
-   AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic_view.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce_dynamic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp
@@ -370,14 +431,19 @@ if(Kokkos_ENABLE_PTHREAD)
   )
 endif()
 
-if(Kokkos_ENABLE_OPENMP)
+if (Kokkos_ENABLE_OPENMP)
+  set(OpenMP_EXTRA_SOURCES
+    openmp/TestOpenMP_Task.cpp
+  )
+  if (Kokkos_ENABLE_DEPRECATED_CODE_3)
+    list(APPEND OpenMP_EXTRA_SOURCES openmp/TestOpenMP_PartitionMaster.cpp)
+  endif ()
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_OpenMP
     SOURCES
     UnitTestMainInit.cpp
     ${OpenMP_SOURCES}
-    openmp/TestOpenMP_PartitionMaster.cpp
-    openmp/TestOpenMP_Task.cpp
+    ${OpenMP_EXTRA_SOURCES}
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_OpenMPInterOp
@@ -463,28 +529,7 @@ if(Kokkos_ENABLE_CUDA)
       UnitTestMainInit.cpp
       cuda/TestCuda_Task.cpp
       cuda/TestCuda_TeamScratchStreams.cpp
-      cuda/TestCudaHostPinned_SharedAlloc.cpp
-      cuda/TestCudaHostPinned_ViewAPI_a.cpp
-      cuda/TestCudaHostPinned_ViewAPI_b.cpp
-      cuda/TestCudaHostPinned_ViewAPI_c.cpp
-      cuda/TestCudaHostPinned_ViewAPI_d.cpp
-      cuda/TestCudaHostPinned_ViewAPI_e.cpp
-      cuda/TestCudaHostPinned_ViewCopy_a.cpp
-      cuda/TestCudaHostPinned_ViewCopy_b.cpp
-      cuda/TestCudaHostPinned_ViewMapping_a.cpp
-      cuda/TestCudaHostPinned_ViewMapping_b.cpp
-      cuda/TestCudaHostPinned_ViewMapping_subview.cpp
-      cuda/TestCudaUVM_SharedAlloc.cpp
-      cuda/TestCudaUVM_ViewAPI_a.cpp
-      cuda/TestCudaUVM_ViewAPI_b.cpp
-      cuda/TestCudaUVM_ViewAPI_c.cpp
-      cuda/TestCudaUVM_ViewAPI_d.cpp
-      cuda/TestCudaUVM_ViewAPI_e.cpp
-      cuda/TestCudaUVM_ViewCopy_a.cpp
-      cuda/TestCudaUVM_ViewCopy_b.cpp
-      cuda/TestCudaUVM_ViewMapping_a.cpp
-      cuda/TestCudaUVM_ViewMapping_b.cpp
-      cuda/TestCudaUVM_ViewMapping_subview.cpp
+      ${Cuda_SOURCES3}
       cuda/TestCuda_Spaces.cpp
   )
 
@@ -524,17 +569,8 @@ if(Kokkos_ENABLE_HIP)
       ${HIP_SOURCES}
       hip/TestHIP_ScanUnit.cpp
       hip/TestHIP_TeamScratchStreams.cpp
-      hip/TestHIPHostPinned_ViewAPI_a.cpp
-      hip/TestHIPHostPinned_ViewAPI_b.cpp
-      hip/TestHIPHostPinned_ViewAPI_c.cpp
-      hip/TestHIPHostPinned_ViewAPI_d.cpp
-      hip/TestHIPHostPinned_ViewAPI_e.cpp
-      hip/TestHIPHostPinned_ViewCopy_a.cpp
-      hip/TestHIPHostPinned_ViewCopy_b.cpp
-      hip/TestHIPHostPinned_ViewMapping_a.cpp
-      hip/TestHIPHostPinned_ViewMapping_b.cpp
-      hip/TestHIPHostPinned_ViewMapping_subview.cpp
       hip/TestHIP_AsyncLauncher.cpp
+      hip/TestHIP_BlocksizeDeduction.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_HIPInterOpInit
@@ -595,13 +631,25 @@ if(Kokkos_ENABLE_SYCL)
       ${SYCL_SOURCES2C}
   )
 
- KOKKOS_ADD_EXECUTABLE_AND_TEST(
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_SYCL2D
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES2D}
   )
- KOKKOS_ADD_EXECUTABLE_AND_TEST(
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL3
+    SOURCES
+      UnitTestMainInit.cpp
+      # FIXME_SYCL
+      sycl/TestSYCL_Task.cpp
+      sycl/TestSYCL_TeamScratchStreams.cpp
+      ${SYCL_SOURCES3}
+      sycl/TestSYCL_Spaces.cpp
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_SYCLInterOpInit
     SOURCES
       UnitTestMain.cpp
@@ -622,8 +670,7 @@ if(Kokkos_ENABLE_SYCL)
 endif()
 
 # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
-if (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+if (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   SET(DEFAULT_DEVICE_SOURCES
     UnitTestMainInit.cpp
     default/TestDefaultDeviceType.cpp
@@ -685,11 +732,21 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
 )
 
   if(KOKKOS_ENABLE_TUNING)
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(
+      UnitTest_TuningBuiltins
+      SOURCES
+      tools/TestBuiltinTuners.cpp
+    )
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
       UnitTest_TuningBasics
       SOURCES
         tools/TestTuning.cpp
     )
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(
+      UnitTest_CategoricalTuner
+      SOURCES
+      tools/TestCategoricalTuner.cpp
+    )
   endif()
   if(NOT Kokkos_ENABLE_OPENMPTARGET)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
@@ -698,6 +755,11 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       tools/TestLogicalSpaces.cpp
   )
   endif()
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_EventCorrectness
+    SOURCES
+    tools/TestEventCorrectness.cpp
+  )
   if(KOKKOS_ENABLE_LIBDL)
 
     KOKKOS_ADD_TEST_LIBRARY(
@@ -745,7 +807,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       TOOL kokkosprinter-tool
       ARGS --kokkos-tools-args="-c test delimit"
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
 
     # Above will test that leading/trailing quotes are stripped bc ctest cmd args is:
@@ -762,7 +824,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       ARGS [=[--kokkos-tools-args=-c test delimit]=]
             --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool>
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
   endif() #KOKKOS_ENABLE_LIBDL
 if(NOT KOKKOS_HAS_TRILINOS)
diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile
index 390fc79a4755e46cbd61b28ee54d44814fa501d9..422628221402586ec4829ad5d8b628cbdd3736b1 100644
--- a/packages/kokkos/core/unit_test/Makefile
+++ b/packages/kokkos/core/unit_test/Makefile
@@ -73,6 +73,8 @@ tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
   ) \
 )
 
+GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewMapping_a ViewMapping_b ViewMapping_subview
+
 SUBVIEW_TESTS = SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13
 
 KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST))
@@ -94,6 +96,16 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
       )\
     )
 
+    GPU_SPACES = CudaHostPinned CudaUVM
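+    # Generate Test<space>_<test>.cpp stubs on the fly if they do not exist
+    # yet, mirroring the configure_file generation on the CMake side.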
+    tmp := $(foreach space, $(GPU_SPACES), \
+      tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \
+        $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\
+          $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \
+          $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \
+        )\
+      )\
+    )
+
     OBJ_CUDA = UnitTestMainInit.o gtest-all.o
     OBJ_CUDA += TestCuda_Init.o
     OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o
@@ -261,6 +273,16 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+	GPU_SPACES = HIPHostPinned
+	tmp := $(foreach space, $(GPU_SPACES), \
+	  tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \
+	    $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\
+	      $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \
+	      $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \
+	    )\
+	  )\
+	)
+
 	OBJ_HIP = UnitTestMainInit.o gtest-all.o
 	OBJ_HIP += TestHIP_Init.o
 	OBJ_HIP += TestHIP_Reducers_a.o TestHIP_Reducers_b.o TestHIP_Reducers_c.o TestHIP_Reducers_d.o
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
index 04362125c0648e679f9a1cfb9886ccb84e6b14d5..257ad2e9e5bba73babacd0153ba74f0ab1a2ba15 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -81,6 +81,56 @@ struct InitFunctor {
   InitFunctor(T _init_value) : init_value(_init_value) {}
 };
 
+//---------------------------------------------------
+//--------------atomic_load/store/assign---------------------
+//---------------------------------------------------
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+template <class T, class DEVICE_TYPE>
+struct LoadStoreFunctor {
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    T old = Kokkos::atomic_load(&data());
+    if (old != i0)
+      Kokkos::abort("Kokkos Atomic Load didn't get the right value");
+    Kokkos::atomic_store(&data(), i1);
+    Kokkos::atomic_assign(&data(), old);
+  }
+  LoadStoreFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+};
+#endif
+
+template <class T, class DeviceType>
+bool LoadStoreAtomicTest(T i0, T i1) {
+  using execution_space = typename DeviceType::execution_space;
+  struct InitFunctor<T, execution_space> f_init(i0);
+  typename InitFunctor<T, execution_space>::type data("Data");
+  typename InitFunctor<T, execution_space>::h_type h_data("HData");
+
+  f_init.data = data;
+  Kokkos::parallel_for(1, f_init);
+  execution_space().fence();
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  struct LoadStoreFunctor<T, execution_space> f(i0, i1);
+
+  f.data = data;
+  Kokkos::parallel_for(1, f);
+#else
+  h_data() = i1;
+#endif
+
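+  // In both branches the device value remains i0: with desul atomics the
+  // functor loads i0, stores i1, and then atomic_assign writes the old value
+  // (i0) back; otherwise data is never modified after initialization.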
+  Kokkos::deep_copy(h_data, data);
+
+  return h_data() == i0;
+}
+
 //---------------------------------------------------
 //--------------atomic_fetch_max---------------------
 //---------------------------------------------------
@@ -594,7 +644,10 @@ struct AndFunctor {
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_and(&data(), (T)i1); }
+  void operator()(int) const {
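+    // atomic_fetch_and returns the old value (i0); and-ing it back in leaves
+    // data == i0 & i1, so the expected result is unchanged while both the
+    // fetching and the non-fetching overloads get exercised.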
+    T result = Kokkos::atomic_fetch_and(&data(), (T)i1);
+    Kokkos::atomic_and(&data(), result);
+  }
 
   AndFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
 };
@@ -665,7 +718,10 @@ struct OrFunctor {
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_or(&data(), (T)i1); }
+  void operator()(int) const {
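+    // atomic_fetch_or returns the old value (i0); or-ing it back in leaves
+    // data == i0 | i1, so the expected result is unchanged while both
+    // overloads get exercised.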
+    T result = Kokkos::atomic_fetch_or(&data(), (T)i1);
+    Kokkos::atomic_or(&data(), result);
+  }
 
   OrFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
 };
@@ -954,6 +1010,7 @@ bool AtomicOperationsTestIntegralType(int i0, int i1, int test) {
     case 10: return RShiftAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 11: return IncAtomicTest<T, DeviceType>((T)i0);
     case 12: return DecAtomicTest<T, DeviceType>((T)i0);
+    case 13: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
   }
 
   return 0;
@@ -966,6 +1023,7 @@ bool AtomicOperationsTestNonIntegralType(int i0, int i1, int test) {
     case 2: return MinAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 3: return MulAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 4: return DivAtomicTest<T, DeviceType>((T)i0, (T)i1);
+    case 5: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
   }
 
   return 0;
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
index ba9937e1c6643bfd8a4decde2c7823061b0fcbe4..303f5b6eb9f77c3767056ac2639122d4b01b9d7a 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
@@ -57,6 +57,8 @@ TEST(TEST_CATEGORY, atomic_operations_double) {
                  double, TEST_EXECSPACE>(start, end - i, 3)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
                  double, TEST_EXECSPACE>(start, end - i, 4)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                 double, TEST_EXECSPACE>(start, end - i, 5)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
index aa56b5ff10e770d2964d498f99e25611d85311c6..d3d4916b4ea6d623b010834627f41ebf65161ff7 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
@@ -57,6 +57,8 @@ TEST(TEST_CATEGORY, atomic_operations_float) {
                  float, TEST_EXECSPACE>(start, end - i, 3)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
                  float, TEST_EXECSPACE>(start, end - i, 4)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                 float, TEST_EXECSPACE>(start, end - i, 5)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
index f828be6223c7b4e7554252fd04927ce1a3fcb69a..e5f2f334fc2b07e24ed5f77d75a64947c2117e21 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_int) {
                  int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
index eee44c9571cf890b25a9a1b9bb32edd279d3cae7..d4fda70e80cff156aa58bc0294c66be38729f745 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_long) {
                  long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
index 73d4a61d7291f852d1ff2d6607d36a1d0bb2f829..b7fb0cdae5f99f6704dbdd18c2554bcb6b3e5ddf 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_longlong) {
                  long long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  long long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 long long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
index 02f337c57c64633d62d8111c28ae49cee05e80e3..c3c6bc9fb38d9dc9af37bc69c29e60d1fd040cc6 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_unsigned) {
                  unsigned int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  unsigned int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 unsigned int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
index f4340475f573c3c8f4c108f8b7bfacef0d72af4e..f3be4bedb794884998639eb9a313db5079bebdd2 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_unsignedlong) {
                  unsigned long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  unsigned long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 unsigned long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicViews.hpp b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
index b615b407f334a60d187bbc3c27b3c69110acc94c..e029ad81f576f25333470e6078eee9445abba3ec 100644
--- a/packages/kokkos/core/unit_test/TestAtomicViews.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
@@ -245,11 +245,11 @@ class TestAtomicViewAPI {
     ASSERT_EQ(ax.use_count(), size_t(4));
     ASSERT_EQ(const_ax.use_count(), ax.use_count());
 
-    ASSERT_FALSE(ax.data() == nullptr);
-    ASSERT_FALSE(const_ax.data() == nullptr);  // referenceable ptr
-    ASSERT_FALSE(unmanaged_ax.data() == nullptr);
-    ASSERT_FALSE(unmanaged_ax_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(ay.data() == nullptr);
+    ASSERT_NE(ax.data(), nullptr);
+    ASSERT_NE(const_ax.data(), nullptr);  // referenceable ptr
+    ASSERT_NE(unmanaged_ax.data(), nullptr);
+    ASSERT_NE(unmanaged_ax_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(ay.data(), nullptr);
     //    ASSERT_NE( ax, ay );
     //    Above test results in following runtime error from gtest:
     //    Expected: (ax) != (ay), actual: 32-byte object <30-01 D0-A0 D8-7F
@@ -278,7 +278,7 @@ class TestAtomicViewAPI {
                          Kokkos::MemoryTraits<Kokkos::Atomic> >& arg_const,
       const Kokkos::View<const DataType, device,
                          Kokkos::MemoryTraits<Kokkos::Atomic> >& arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_const() {
@@ -290,8 +290,8 @@ class TestAtomicViewAPI {
     typeX x("X");
     const_typeX xc = x;
 
-    // ASSERT_TRUE( xc == x ); // const xc is referenceable, non-const x is not
-    // ASSERT_TRUE( x == xc );
+    // ASSERT_EQ(xc, x);  // const xc is referenceable, non-const x is not
+    // ASSERT_EQ(x, xc);
 
     check_auto_conversion_to_const(x, xc);
   }
diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp
index e41ad5257d64ad3acb3266a0354f18d291662377..f2993914a11560f30cf70aabacf444f3b38e9bfc 100644
--- a/packages/kokkos/core/unit_test/TestAtomics.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomics.hpp
@@ -97,7 +97,7 @@ struct SuperScalar {
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar operator+(const SuperScalar& src) {
+  SuperScalar operator+(const SuperScalar& src) const {
     SuperScalar tmp = *this;
     for (int i = 0; i < N; i++) {
       tmp.val[i] += src.val[i];
@@ -540,8 +540,6 @@ TEST(TEST_CATEGORY, atomics) {
 
 // FIXME_SYCL atomics for large types to be implemented
 #ifndef KOKKOS_ENABLE_SYCL
-  // FIXME_HIP HIP doesn't yet support atomics for >64bit types properly
-#ifndef KOKKOS_ENABLE_HIP
   ASSERT_TRUE(
       (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1)));
   ASSERT_TRUE(
@@ -567,7 +565,6 @@ TEST(TEST_CATEGORY, atomics) {
 #endif
 #endif
 #endif
-#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp
index b926058ebf990b0c7d0bff6f4c22b5bd4c12e2e8..be0c1e50d7013efc177f427b481a4b67b1441744 100644
--- a/packages/kokkos/core/unit_test/TestComplex.hpp
+++ b/packages/kokkos/core/unit_test/TestComplex.hpp
@@ -515,4 +515,44 @@ TEST(TEST_CATEGORY, complex_issue_3867) {
 #undef CHECK_POW_COMPLEX_PROMOTION
 }
 
+TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) {
+#define STATIC_ASSERT(cond) static_assert(cond, "")
+
+  STATIC_ASSERT(Kokkos::real(1) == 1.);
+  STATIC_ASSERT(Kokkos::real(2.f) == 2.f);
+  STATIC_ASSERT(Kokkos::real(3.) == 3.);
+  STATIC_ASSERT(Kokkos::real(4.l) == 4.l);
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(1)), double>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(2.f)), float>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(3.)), double>::value));
+  STATIC_ASSERT(
+      (std::is_same<decltype(Kokkos::real(4.l)), long double>::value));
+
+  STATIC_ASSERT(Kokkos::imag(1) == 0.);
+  STATIC_ASSERT(Kokkos::imag(2.f) == 0.f);
+  STATIC_ASSERT(Kokkos::imag(3.) == 0.);
+  STATIC_ASSERT(Kokkos::imag(4.l) == 0.l);
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(1)), double>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(2.f)), float>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(3.)), double>::value));
+  STATIC_ASSERT(
+      (std::is_same<decltype(Kokkos::imag(4.l)), long double>::value));
+
+  // FIXME in principle could be checked at compile time too
+  ASSERT_EQ(Kokkos::conj(1), Kokkos::complex<double>(1));
+  ASSERT_EQ(Kokkos::conj(2.f), Kokkos::complex<float>(2.f));
+  ASSERT_EQ(Kokkos::conj(3.), Kokkos::complex<double>(3.));
+  ASSERT_EQ(Kokkos::conj(4.l), Kokkos::complex<long double>(4.l));
+  STATIC_ASSERT((
+      std::is_same<decltype(Kokkos::conj(1)), Kokkos::complex<double>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(2.f)),
+                              Kokkos::complex<float>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(3.)),
+                              Kokkos::complex<double>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(4.l)),
+                              Kokkos::complex<long double>>::value));
+
+#undef STATIC_ASSERT
+}
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
index 49f8daf89eabca9b3aa7e1f06d7a10ceb23a6a24..f487a015fbf261f85bf2b8a0b4755dadcefe2f32 100644
--- a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
+++ b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
@@ -296,7 +296,7 @@ struct TestDeepCopyScalarConversion {
 
     int64_t errors = 0;
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
 
     Kokkos::deep_copy(view_s1_1d, static_cast<Scalar1>(0));
     Kokkos::deep_copy(view_s1_2d, static_cast<Scalar1>(0));
@@ -306,7 +306,7 @@ struct TestDeepCopyScalarConversion {
                                              Kokkos::IndexType<int64_t>>(0, N0),
                          *this);
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors > 0);
+    ASSERT_GT(errors, 0);
 
     Kokkos::deep_copy(error_count, 0);
     Kokkos::deep_copy(TEST_EXECSPACE(), view_s1_1d, view_s2_1d);
@@ -318,7 +318,7 @@ struct TestDeepCopyScalarConversion {
                          *this);
 
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 };
 }  // namespace Impl
diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 8a9263c8df5fcd1eab370837d9d2de92281e7aaa..90e485998ec08ba98716185298860fb4c407daf2 100644
--- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -79,7 +79,7 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
   int numa_idx    = (do_other ? 3 : 0) + (do_threads ? 1 : 0);
   int device_idx =
       (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0);
-  int tune_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) +
+  int tune_idx = (do_other ? 4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) +
                  (do_device ? 1 : 0);
 
   if (do_threads) {
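
A worked example of the index arithmetic above (an inference from this hunk;
the argument list itself is not shown): numa_idx and device_idx still skip
three "other" argv slots while tune_idx now skips four. With every flag set,
numa_idx = 3 + 1 = 4, device_idx = 3 + 1 + 1 = 5, and
tune_idx = 4 + 1 + 1 + 1 = 7, leaving slot 6 free, presumably for an
additional "other" argument placed before the tune flag.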
diff --git a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f87fda615643c5a75ef5ee4da7349bab0eea40cd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_DetectionIdiom.hpp>
+
+#define STATIC_ASSERT(cond) static_assert(cond, "")
+
+void test_nonesuch() {
+  using Kokkos::nonesuch;
+  STATIC_ASSERT(!std::is_constructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_destructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_copy_constructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_move_constructible<nonesuch>::value);
+#ifdef KOKKOS_ENABLE_CXX17
+  STATIC_ASSERT(!std::is_aggregate<nonesuch>::value);
+#endif
+}
+
+#undef STATIC_ASSERT
+
+namespace Example {
+// Example from https://en.cppreference.com/w/cpp/experimental/is_detected
+template <class T>
+using copy_assign_t = decltype(std::declval<T&>() = std::declval<const T&>());
+
+struct Meow {};
+struct Purr {
+  void operator=(const Purr&) = delete;
+};
+
+static_assert(Kokkos::is_detected<copy_assign_t, Meow>::value,
+              "Meow should be copy assignable!");
+static_assert(!Kokkos::is_detected<copy_assign_t, Purr>::value,
+              "Purr should not be copy assignable!");
+static_assert(Kokkos::is_detected_exact<Meow&, copy_assign_t, Meow>::value,
+              "Copy assignment of Meow should return Meow&!");
+
+template <class T>
+using diff_t = typename T::difference_type;
+
+template <class Ptr>
+using difference_type = Kokkos::detected_or_t<std::ptrdiff_t, diff_t, Ptr>;
+
+struct Woof {
+  using difference_type = int;
+};
+struct Bark {};
+
+static_assert(std::is_same<difference_type<Woof>, int>::value,
+              "Woof's difference_type should be int!");
+static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
+              "Bark's difference_type should be ptrdiff_t!");
+}  // namespace Example
+
+int main() {}
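
For readers unfamiliar with the pattern exercised by this file, a minimal
sketch of the detection idiom follows. It is modeled on the
std::experimental::is_detected reference implementation; Kokkos's actual code
in Kokkos_DetectionIdiom.hpp may differ (for instance, it uses nonesuch rather
than void as the default for is_detected):

    #include <type_traits>

    template <class...>
    using void_t = void;

    // Primary template: selected when Op<Args...> is ill-formed.
    template <class Default, class AlwaysVoid, template <class...> class Op,
              class... Args>
    struct detector {
      using value_t = std::false_type;
      using type    = Default;
    };

    // Selected when Op<Args...> is well-formed: void_t<Op<Args...>> collapses
    // to void and matches the void passed in by the aliases below.
    template <class Default, template <class...> class Op, class... Args>
    struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
      using value_t = std::true_type;
      using type    = Op<Args...>;
    };

    template <template <class...> class Op, class... Args>
    using is_detected = typename detector<void, void, Op, Args...>::value_t;

    template <class Default, template <class...> class Op, class... Args>
    using detected_or_t = typename detector<Default, void, Op, Args...>::type;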
diff --git a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f8f5275d3dd66ce42256bf61637112d026687a3a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
@@ -0,0 +1,129 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+namespace {
+struct SumFunctor {
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i, int& lsum) const { lsum += i; }
+};
+
+template <class ExecSpace>
+void check_distinctive(ExecSpace, ExecSpace) {}
+
+#ifdef KOKKOS_ENABLE_CUDA
+void check_distinctive(Kokkos::Cuda exec1, Kokkos::Cuda exec2) {
+  ASSERT_NE(exec1.cuda_stream(), exec2.cuda_stream());
+}
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+void check_distinctive(Kokkos::Experimental::HIP exec1,
+                       Kokkos::Experimental::HIP exec2) {
+  ASSERT_NE(exec1.hip_stream(), exec2.hip_stream());
+}
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+void check_distinctive(Kokkos::Experimental::SYCL exec1,
+                       Kokkos::Experimental::SYCL exec2) {
+  ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue,
+            *exec2.impl_internal_space_instance()->m_queue);
+}
+#endif
+}  // namespace
+
+void test_partitioning(std::vector<TEST_EXECSPACE>& instances) {
+  check_distinctive(instances[0], instances[1]);
+  int sum1, sum2;
+  int N = 3910;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<TEST_EXECSPACE>(instances[0], 0, N), SumFunctor(),
+      sum1);
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<TEST_EXECSPACE>(instances[1], 0, N), SumFunctor(),
+      sum2);
+  ASSERT_EQ(sum1, sum2);
+  ASSERT_EQ(sum1, N * (N - 1) / 2);
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL)
+  // Eliminate the unused-function warning (e.g. when compiling for Serial
+  // and CUDA, the Cuda overload is unused during the Serial compilation)
+  if (sum1 != sum2) {
+#ifdef KOKKOS_ENABLE_CUDA
+    check_distinctive(Kokkos::Cuda(), Kokkos::Cuda());
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+    check_distinctive(Kokkos::Experimental::HIP(), Kokkos::Experimental::HIP());
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+    check_distinctive(Kokkos::Experimental::SYCL(),
+                      Kokkos::Experimental::SYCL());
+#endif
+  }
+#endif
+}
+
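+// partition_space splits one execution space instance into independent
+// instances (distinct streams/queues on GPU backends, as check_distinctive
+// verifies above), one per weight argument passed.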
+TEST(TEST_CATEGORY, partitioning_by_args) {
+  auto instances =
+      Kokkos::Experimental::partition_space(TEST_EXECSPACE(), 1, 1.);
+  ASSERT_EQ(int(instances.size()), 2);
+  test_partitioning(instances);
+}
+
+TEST(TEST_CATEGORY, partitioning_by_vector) {
+  std::vector<int> weights{1, 1};
+  auto instances =
+      Kokkos::Experimental::partition_space(TEST_EXECSPACE(), weights);
+  ASSERT_EQ(int(instances.size()), 2);
+  test_partitioning(instances);
+}
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp
similarity index 68%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
rename to packages/kokkos/core/unit_test/TestExecutionSpace.hpp
index 26dc9b0e000096ab1809412c4a29fc563844cbd1..8e4331e809a806f8b7931735e445706abc2e06c5 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
+++ b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp
@@ -42,5 +42,39 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_c.hpp>
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
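+// Holds a device and a host execution space instance so that copying the
+// struct into a device lambda exercises both copy constructors on device.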
+struct StructCopy {
+  Kokkos::DefaultExecutionSpace device;
+  Kokkos::DefaultHostExecutionSpace host;
+};
+
+template <class ExecutionSpace>
+void check_struct_copy() {
+#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+  // FIXME_OPENMPTARGET nvlink error: Undefined reference to
+  // '_ZSt25__throw_bad_function_callv' in
+  // '/tmp/TestOpenMPTarget_ExecutionSpace-434d81.cubin'
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+  StructCopy data;
+  parallel_for(
+      Kokkos::RangePolicy<ExecutionSpace>(0, 1), KOKKOS_LAMBDA(int) {
+        StructCopy data2 = data;
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF("%i \n", data2.device.in_parallel());
+      });
+#endif
+#endif
+}
+
+}  // namespace
+
+TEST(TEST_CATEGORY, copy_structure) { check_struct_copy<TEST_EXECSPACE>(); }
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestHalfConversion.hpp b/packages/kokkos/core/unit_test/TestHalfConversion.hpp
index 277fb1b04234e58b0fe3d1639e48fcd1dc51ff86..992f56cc6b833882676f71817bae3b6bd03631d6 100644
--- a/packages/kokkos/core/unit_test/TestHalfConversion.hpp
+++ b/packages/kokkos/core/unit_test/TestHalfConversion.hpp
@@ -53,7 +53,7 @@ void test_half_conversion_type() {
   T base                         = static_cast<T>(3.3);
   Kokkos::Experimental::half_t a = Kokkos::Experimental::cast_to_half(base);
   T b                            = Kokkos::Experimental::cast_from_half<T>(a);
-  ASSERT_TRUE((double(b - base) / double(base)) < epsilon);
+  ASSERT_LT((double(b - base) / double(base)), epsilon);
 
 // TODO: Remove ifndef once https://github.com/kokkos/kokkos/pull/3480 merges
 #ifndef KOKKOS_ENABLE_SYCL
@@ -67,7 +67,7 @@ void test_half_conversion_type() {
       });
 
   Kokkos::deep_copy(b, b_v);
-  ASSERT_TRUE((double(b - base) / double(base)) < epsilon);
+  ASSERT_LT((double(b - base) / double(base)), epsilon);
 #endif  // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
 #endif  // KOKKOS_ENABLE_SYCL
 }
diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
index db52a05d5d36d5919e101f60dd7652c92771c885..c4cf8a745701897a2a36c38540a94149650b5e2d 100644
--- a/packages/kokkos/core/unit_test/TestHalfOperators.hpp
+++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
@@ -269,6 +269,85 @@ enum OP_TESTS {
   N_OP_TESTS
 };
 
+template <class view_type>
+struct Functor_TestHalfVolatileOperators {
+  volatile half_t h_lhs, h_rhs;
+  view_type actual_lhs, expected_lhs;
+  double d_lhs, d_rhs;
+  Functor_TestHalfVolatileOperators(volatile half_t lhs = half_t(0),
+                                    volatile half_t rhs = half_t(0))
+      : h_lhs(lhs), h_rhs(rhs) {
+    actual_lhs   = view_type("actual_lhs", N_OP_TESTS);
+    expected_lhs = view_type("expected_lhs", N_OP_TESTS);
+    d_lhs        = cast_from_half<double>(h_lhs);
+    d_rhs        = cast_from_half<double>(h_rhs);
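+    // Run the checks where the views live: invoke operator() directly for
+    // host views, or launch a one-iteration kernel for device views.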
+    if (std::is_same<view_type, ViewTypeHost>::value) {
+      auto run_on_host = *this;
+      run_on_host(0);
+    } else {
+      Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators",
+                           Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void operator()(int) const {
+    volatile half_t tmp_lhs;
+
+    // Initialize output views to catch missing test invocations
+    for (int i = 0; i < N_OP_TESTS; ++i) {
+      actual_lhs(i)   = 1;
+      expected_lhs(i) = -1;
+    }
+
+    tmp_lhs              = h_lhs;
+    actual_lhs(ASSIGN)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(ASSIGN) = d_lhs;
+
+    actual_lhs(LT)   = h_lhs < h_rhs;
+    expected_lhs(LT) = d_lhs < d_rhs;
+
+    actual_lhs(LE)   = h_lhs <= h_rhs;
+    expected_lhs(LE) = d_lhs <= d_rhs;
+
+    actual_lhs(NEQ)   = h_lhs != h_rhs;
+    expected_lhs(NEQ) = d_lhs != d_rhs;
+
+    actual_lhs(GT)   = h_lhs > h_rhs;
+    expected_lhs(GT) = d_lhs > d_rhs;
+
+    actual_lhs(GE)   = h_lhs >= h_rhs;
+    expected_lhs(GE) = d_lhs >= d_rhs;
+
+    actual_lhs(EQ)   = h_lhs == h_rhs;
+    expected_lhs(EQ) = d_lhs == d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs += h_rhs;
+    actual_lhs(CADD_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CADD_H_H) = d_lhs;
+    expected_lhs(CADD_H_H) += d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs -= h_rhs;
+    actual_lhs(CSUB_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CSUB_H_H) = d_lhs;
+    expected_lhs(CSUB_H_H) -= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs *= h_rhs;
+    actual_lhs(CMUL_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CMUL_H_H) = d_lhs;
+    expected_lhs(CMUL_H_H) *= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs /= h_rhs;
+    actual_lhs(CDIV_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CDIV_H_H) = d_lhs;
+    expected_lhs(CDIV_H_H) /= d_rhs;
+  }
+};
+
 template <class view_type>
 struct Functor_TestHalfOperators {
   half_t h_lhs, h_rhs;
@@ -840,8 +919,33 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
                 epsilon);
   }
 
-  // Check whether half_t is trivially copyable
-  ASSERT_TRUE(std::is_trivially_copyable<half_t>::value);
+  // Test partial volatile support
+  volatile half_t _h_lhs = h_lhs;
+  volatile half_t _h_rhs = h_rhs;
+  Functor_TestHalfVolatileOperators<ViewType> f_volatile_device(_h_lhs, _h_rhs);
+  Functor_TestHalfVolatileOperators<ViewTypeHost> f_volatile_host(_h_lhs,
+                                                                  _h_rhs);
+
+  ExecutionSpace().fence();
+  Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs);
+  Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs);
+  for (int op_test = 0; op_test < N_OP_TESTS; op_test++) {
+    // printf("op_test = %d\n", op_test);
+    if (op_test == ASSIGN || op_test == LT || op_test == LE || op_test == NEQ ||
+        op_test == EQ || op_test == GT || op_test == GE ||
+        op_test == CADD_H_H || op_test == CSUB_H_H || op_test == CMUL_H_H ||
+        op_test == CDIV_H_H) {
+      ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test),
+                  epsilon);
+      ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test),
+                  epsilon);
+    }
+  }
+
+  // is_trivially_copyable is no longer true because explicit copy
+  // constructors, required for supporting reductions, were added
+  // ASSERT_TRUE(std::is_trivially_copyable<half_t>::value);
+
   constexpr size_t n       = 2;
   constexpr size_t n_bytes = sizeof(half_t) * n;
   const half_t h_arr0 = half_t(0x89ab), h_arr1 = half_t(0xcdef);
@@ -854,11 +958,11 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
   h_arr_ptr = reinterpret_cast<char*>(h_arr);
 
   std::memcpy(c_arr, h_arr, n_bytes);
-  for (i = 0; i < n_bytes; i++) ASSERT_TRUE(c_arr[i] == h_arr_ptr[i]);
+  for (i = 0; i < n_bytes; i++) ASSERT_EQ(c_arr[i], h_arr_ptr[i]);
 
   std::memcpy(h_arr, c_arr, n_bytes);
-  ASSERT_TRUE(h_arr[0] == h_arr0);
-  ASSERT_TRUE(h_arr[1] == h_arr1);
+  ASSERT_EQ(h_arr[0], h_arr0);
+  ASSERT_EQ(h_arr[1], h_arr1);
 }
 
 void test_half_operators() {
@@ -870,7 +974,6 @@ void test_half_operators() {
     // TODO: __test_half_operators(h_lhs + cast_to_half(i + 1), half_t(0));
     // TODO: __test_half_operators(half_t(0), h_rhs + cast_to_half(i));
   }
-  // TODO: __test_half_operators(0, 0);
 }
 
 TEST(TEST_CATEGORY, half_operators) { test_half_operators(); }
diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
index 18d1ac85188ca17cd7d127d3187103f42402be18..10180251ba582e9d11672ce74b4d22335c9da3d4 100644
--- a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
+++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
@@ -52,14 +52,17 @@ using Kokkos::Impl::HostSharedPtr;
 namespace {
 
 class Data {
-  Kokkos::Array<char, 64> d;
+  char d[64];
 
  public:
-  KOKKOS_FUNCTION void write(char const* c) {
-    for (int i = 0; i < 64 && c; ++i, ++c) {
-      d[i] = *c;
-    }
+  // Because strncpy is not supported within device code
+  static KOKKOS_FUNCTION void my_strncpy(char* dst, const char* src,
+                                         size_t cnt) {
+    // cnt is unsigned, so decrement it only while it is known to be positive
+    // (a post-decrement in the loop condition would wrap past zero when src
+    // fills the whole buffer)
+    for (; cnt > 0 && (*dst = *src) != '\0'; --cnt, ++dst, ++src)
+      ;
+    for (; cnt > 0; --cnt) *dst++ = '\0';
   }
+  KOKKOS_FUNCTION void write(char const* s) { my_strncpy(d, s, sizeof(d)); }
 };
 
 template <class SmartPtr>
@@ -154,3 +157,135 @@ TEST(TEST_CATEGORY, host_shared_ptr_special_members_on_device) {
   check_special_members_on_device(device_ptr);
 }
 #endif
+
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \
+    !defined(KOKKOS_ENABLE_OPENMPTARGET)
+namespace {
+
+struct Bar {
+  double val;
+};
+
+struct Foo {
+  Foo(bool allocate = false) : ptr(allocate ? new Bar : nullptr) {}
+  Kokkos::Impl::HostSharedPtr<Bar> ptr;
+  int use_count() { return ptr.use_count(); }
+};
+
+template <class DevMemSpace, class HostMemSpace>
+void host_shared_ptr_test_reference_counting() {
+  using ExecSpace = typename DevMemSpace::execution_space;
+  bool is_gpu =
+      !Kokkos::SpaceAccessibility<ExecSpace, Kokkos::HostSpace>::accessible;
+
+  // Create two tracked instances
+  Foo f1(true), f2(true);
+  // Scope Views
+  {
+    Foo* fp_d_ptr =
+        static_cast<Foo*>(Kokkos::kokkos_malloc<DevMemSpace>(sizeof(Foo)));
+    Kokkos::View<Foo, DevMemSpace> fp_d(fp_d_ptr);
+    // If using UVM or on the CPU don't make an extra HostCopy
+    Foo* fp_h_ptr = std::is_same<DevMemSpace, HostMemSpace>::value
+                        ? fp_d_ptr
+                        : static_cast<Foo*>(
+                              Kokkos::kokkos_malloc<HostMemSpace>(sizeof(Foo)));
+    Kokkos::View<Foo, HostMemSpace> fp_h(fp_h_ptr);
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // Initialize the data of the host copy (not strictly required here)
+    new (fp_h.data()) Foo();
+    // placement new in kernel:
+    //  if on GPU: should not increase the use_count; fp_d will not be tracked
+    //  if on Host: the refcount will increase and fp_d is tracked
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<ExecSpace>(0, 1),
+        KOKKOS_LAMBDA(int) { new (fp_d.data()) Foo(f1); });
+    Kokkos::fence();
+    Kokkos::deep_copy(fp_h, fp_d);
+
+    if (is_gpu)
+      ASSERT_EQ(1, f1.use_count());
+    else
+      ASSERT_EQ(2, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // assignment operator on host will increase the use_count of f2:
+    //   if the default device is a GPU: fp_h was untracked
+    //   if the default device is the CPU: fp_h was tracked and f1's
+    //   use_count was 2 because fp_h aliased it, so that count is
+    //   decreased here
+    fp_h() = f2;
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    Kokkos::deep_copy(fp_d, fp_h);
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    // assignment in kernel:
+    //  If on GPU: the use_count of f1 should not increase and fp_d will not
+    //  be tracked.
+    //  If on Host: the use_count of f1 increases, fp_d is tracked, and the
+    //  use_count of f2 goes down.
+    //  Since we are manipulating the use count on the device, make the host
+    //  copy untracked first. Note that if fp_d and fp_h alias each other
+    //  (e.g. compiling for CPU only), fp_d() will be untracked too during
+    //  the assignment.
+    fp_h() = Foo();
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<ExecSpace>(0, 1),
+        KOKKOS_LAMBDA(int) { fp_d() = f1; });
+    Kokkos::fence();
+    Kokkos::deep_copy(fp_h, fp_d);
+
+    if (is_gpu)
+      ASSERT_EQ(1, f1.use_count());
+    else
+      ASSERT_EQ(2, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // Assign non-tracked ptr
+    //   if  is_gpu: will not change use_count
+    //   if !is_gpu: will decrease use_count of f1
+    fp_h() = Foo();
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+    fp_h() = f2;
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    // before deleting the host version make sure it's not tracked
+    fp_h() = Foo();
+    if (fp_h_ptr != fp_d_ptr) Kokkos::kokkos_free<HostMemSpace>(fp_h_ptr);
+    Kokkos::kokkos_free<DevMemSpace>(fp_d_ptr);
+  }
+
+  ASSERT_EQ(1, f1.use_count());
+  ASSERT_EQ(1, f2.use_count());
+}
+}  // namespace
+
+TEST(TEST_CATEGORY, host_shared_ptr_tracking) {
+  host_shared_ptr_test_reference_counting<typename TEST_EXECSPACE::memory_space,
+                                          Kokkos::HostSpace>();
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value)
+    host_shared_ptr_test_reference_counting<Kokkos::CudaUVMSpace,
+                                            Kokkos::CudaUVMSpace>();
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
+    host_shared_ptr_test_reference_counting<
+        Kokkos::Experimental::SYCLSharedUSMSpace,
+        Kokkos::Experimental::SYCLSharedUSMSpace>();
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
+    host_shared_ptr_test_reference_counting<
+        Kokkos::Experimental::HIPHostPinnedSpace,
+        Kokkos::Experimental::HIPHostPinnedSpace>();
+#endif
+}
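+
+// The first instantiation pairs the execution space's memory with a separate
+// HostSpace copy; the extra shared-space instantiations (CUDA UVM, SYCL
+// shared USM, HIP host-pinned) make fp_d and fp_h alias a single allocation
+// while kernels still run on the device, exercising the untracked-copy
+// behavior on memory the host can also touch.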
+
+#endif  // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
diff --git a/packages/kokkos/core/unit_test/TestInterOp.cpp b/packages/kokkos/core/unit_test/TestInterOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f08afada9d7c74e6d949949a6f52c2b7e3b7ada
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestInterOp.cpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <KokkosExp_InterOp.hpp>
+
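+// The static_asserts below encode the expectation that python_view_type_t
+// maps a view type onto its fully-specified equivalent, spelling out any
+// defaulted layout and memory-space arguments, so that e.g. Python bindings
+// can match one canonical type per data type / layout / space combination.
+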
+// View
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<double*>>,
+        Kokkos::View<
+            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: View");
+
+// DynRankView
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<double>>,
+        Kokkos::DynRankView<
+            double, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView");
+
+// View + Execution Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::View<double*, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::View<
+            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: View + Execution Space");
+
+// DynRankView + Execution Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::DynRankView<double, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::DynRankView<
+            double, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Execution Space");
+
+// View + Memory space
+static_assert(std::is_same<Kokkos::Experimental::python_view_type_t<
+                               Kokkos::View<int64_t*, Kokkos::HostSpace>>,
+                           Kokkos::View<int64_t*, Kokkos::LayoutRight,
+                                        Kokkos::HostSpace>>::value,
+              "Error! Unexpected python_view_type for: View + Memory space");
+
+// DynRankView + Memory space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<
+                     Kokkos::DynRankView<int16_t, Kokkos::HostSpace>>,
+                 Kokkos::DynRankView<int16_t, Kokkos::LayoutRight,
+                                     Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Memory space");
+
+// View + Layout + Execution space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<
+            int**, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::View<int**, Kokkos::LayoutLeft,
+                     typename Kokkos::DefaultExecutionSpace::memory_space>>::
+        value,
+    "Error! Unexpected python_view_type for: View + Layout + Execution space");
+
+// DynRankView + Layout + Execution space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+                     int, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>,
+                 Kokkos::DynRankView<int, Kokkos::LayoutLeft,
+                                     typename Kokkos::DefaultExecutionSpace::
+                                         memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Execution "
+    "space");
+
+// View + Layout + Memory Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
+        Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: View + Layout + Memory Space");
+
+// DynRankView + Layout + Memory Space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+                     uint64_t, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
+                 Kokkos::DynRankView<uint64_t, Kokkos::LayoutLeft,
+                                     Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Memory "
+    "Space");
+
+// View + Layout + Execution space + Memory Trait
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<
+            float***, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace,
+            Kokkos::MemoryTraits<Kokkos::RandomAccess>>>,
+        Kokkos::View<float***, Kokkos::LayoutLeft,
+                     typename Kokkos::DefaultHostExecutionSpace::memory_space,
+                     Kokkos::MemoryTraits<Kokkos::RandomAccess>>>::value,
+    "Error! Unexpected python_view_type for: View + Layout + Execution space + "
+    "Memory Trait");
+
+// DynRankView + Layout + Execution space  + Memory trait
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+            float, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace,
+            Kokkos::MemoryTraits<Kokkos::Atomic>>>,
+        Kokkos::DynRankView<
+            float, Kokkos::LayoutLeft,
+            typename Kokkos::DefaultHostExecutionSpace::memory_space,
+            Kokkos::MemoryTraits<Kokkos::Atomic>>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Execution "
+    "space  + Memory trait");
diff --git a/packages/kokkos/core/unit_test/TestMDRange.hpp b/packages/kokkos/core/unit_test/TestMDRange.hpp
index 5618e40989b185a0233de2b20d6dec6636c9fe51..57461be714cde62bdd9b370c834759b91a13da92 100644
--- a/packages/kokkos/core/unit_test/TestMDRange.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange.hpp
@@ -2751,9 +2751,18 @@ struct TestMDRange_6D {
                            const int N3, const int N4, const int N5) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
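+      // For reference: in the CUDA backend Kokkos::LaunchBounds<MaxThreads,
+      // MinBlocks> is forwarded to __launch_bounds__, capping the kernel at
+      // MaxThreads threads per block and requesting MinBlocks resident
+      // blocks per multiprocessor, which bounds per-thread register usage.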
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2772,9 +2781,18 @@ struct TestMDRange_6D {
 #endif
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2807,9 +2825,18 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 2}});
@@ -2832,9 +2859,18 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar + label
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
 
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
@@ -2858,9 +2894,19 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar view
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type =
+          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
+                                         Kokkos::IndexType<int>,
+                                         Kokkos::LaunchBounds<512, 1>>;
+#endif
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 2}});
@@ -2888,9 +2934,18 @@ struct TestMDRange_6D {
     // Test Min reducer with lambda
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       range_type range({{1, 1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 1}});
 
@@ -2923,9 +2978,19 @@ struct TestMDRange_6D {
 
     // Tagged operator test
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
           Kokkos::IndexType<int>, InitTag>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
+          Kokkos::IndexType<int>, InitTag>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2977,9 +3042,18 @@ struct TestMDRange_6D {
                         const int N4, const int N5) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3028,8 +3102,16 @@ struct TestMDRange_6D {
 #endif
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>>;
+#endif
       using point_type = typename range_type::point_type;
 
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
@@ -3062,9 +3144,18 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>, InitTag>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>, InitTag>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3115,9 +3206,18 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3158,9 +3258,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3201,9 +3311,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Left>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Left, Iterate::Left>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3244,9 +3364,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Right>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Left, Iterate::Right>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3287,9 +3417,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Left>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Right, Iterate::Left>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3330,9 +3470,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Right>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Right, Iterate::Right>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3683,9 +3833,18 @@ struct TestMDRange_6D_NegIdx {
   static void test_6D_negidx(const int N0, const int N1, const int N2,
                              const int N3, const int N4, const int N5) {
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes a hang with Intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<256, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
index 777f91aea3e560981d5dde05767f1726d8a1542f..b38871afaaf6a277f6080e34f1a81aac31f6fb93 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
@@ -601,7 +601,8 @@ TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.f, 3.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2., 3.);
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
-#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))  // FIXME
+// FIXME: fails with gcc on Power platforms
+#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.l, 3.l);
 #endif
 #endif
@@ -668,7 +669,13 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(log10)({1234.l, 567.l, 89.l, .003l});
 #endif
 
+// FIXME_OPENMPTARGET FIXME_AMD: skip the int overload of log2 there
+#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) &&                           \
+      (defined(KOKKOS_ARCH_VEGA906) || defined(KOKKOS_ARCH_VEGA908) || \
+       defined(KOKKOS_ARCH_VEGA90A)))
+
   TEST_MATH_FUNCTION(log2)({1, 23, 456, 7890});
+#endif
   TEST_MATH_FUNCTION(log2)({1l, 23l, 456l, 7890l});
   TEST_MATH_FUNCTION(log2)({1ll, 23ll, 456ll, 7890ll});
   TEST_MATH_FUNCTION(log2)({1u, 23u, 456u, 7890u});
@@ -869,3 +876,69 @@ TEST(TEST_CATEGORY,
 #endif
 #endif
 }
+
+template <class Space>
+struct TestAbsoluteValueFunction {
+  TestAbsoluteValueFunction() { run(); }
+  void run() const {
+    int errors = 0;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
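+  // The struct is its own reduction functor: every failed check in
+  // operator() bumps the thread-local error count, parallel_reduce sums the
+  // counts, and run() asserts that the total is zero.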
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using Kokkos::Experimental::abs;
+    if (abs(1) != 1 || abs(-1) != 1) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(int)\n");
+    }
+    if (abs(2l) != 2l || abs(-2l) != 2l) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long int)\n");
+    }
+    if (abs(3ll) != 3ll || abs(-3ll) != 3ll) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long long int)\n");
+    }
+    if (abs(4.f) != 4.f || abs(-4.f) != 4.f) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(float)\n");
+    }
+    if (abs(5.) != 5. || abs(-5.) != 5.) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (abs(6.l) != 6.l || abs(-6.l) != 6.l) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long double)\n");
+    }
+#endif
+    // special values
+    using Kokkos::Experimental::isinf;
+    using Kokkos::Experimental::isnan;
+    if (abs(-0.) != 0.
+    // WORKAROUND: icpx changes the default FP model at optimization levels
+    // >= 1; using -fp-model=precise works too
+#ifndef __INTEL_LLVM_COMPILER
+        || !isinf(abs(-INFINITY)) || !isnan(abs(-NAN))
+#endif
+    ) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "failed abs(floating_point) special values\n");
+    }
+
+    static_assert(std::is_same<decltype(abs(1)), int>::value, "");
+    static_assert(std::is_same<decltype(abs(2l)), long>::value, "");
+    static_assert(std::is_same<decltype(abs(3ll)), long long>::value, "");
+    static_assert(std::is_same<decltype(abs(4.f)), float>::value, "");
+    static_assert(std::is_same<decltype(abs(5.)), double>::value, "");
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    static_assert(std::is_same<decltype(abs(6.l)), long double>::value, "");
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_absolute_value) {
+  TestAbsoluteValueFunction<TEST_EXECSPACE>();
+}
diff --git a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d9b4db6bdef50c48a7010d907fb9abf02e05c35
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
@@ -0,0 +1,1895 @@
+#include <fstream>
+#include <gtest/gtest.h>
+#include "Kokkos_Core.hpp"
+
+namespace Test {
+
+struct TestLargeArgTag {};
+struct TestRealErfcxTag {};
+
+template <class ExecSpace>
+struct TestExponentialIntergral1Function {
+  using ViewType     = Kokkos::View<double*, ExecSpace>;
+  using HostViewType = Kokkos::View<double*, Kokkos::HostSpace>;
+
+  ViewType d_x, d_expint;
+  typename ViewType::HostMirror h_x, h_expint;
+  HostViewType h_ref;
+
+  void testit() {
+    using Kokkos::Experimental::fabs;
+    using Kokkos::Experimental::infinity;
+
+    d_x      = ViewType("d_x", 15);
+    d_expint = ViewType("d_expint", 15);
+    h_x      = Kokkos::create_mirror_view(d_x);
+    h_expint = Kokkos::create_mirror_view(d_expint);
+    h_ref    = HostViewType("h_ref", 15);
+
+    // Generate test inputs
+    h_x(0)  = -0.2;
+    h_x(1)  = 0.0;
+    h_x(2)  = 0.2;
+    h_x(3)  = 0.8;
+    h_x(4)  = 1.6;
+    h_x(5)  = 5.1;
+    h_x(6)  = 0.01;
+    h_x(7)  = 0.001;
+    h_x(8)  = 1.0;
+    h_x(9)  = 1.001;
+    h_x(10) = 1.01;
+    h_x(11) = 1.1;
+    h_x(12) = 7.2;
+    h_x(13) = 10.3;
+    h_x(14) = 15.4;
+    Kokkos::deep_copy(d_x, h_x);
+
+    // Call exponential integral function
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 15), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_expint, d_expint);
+
+    // Reference values computed with Octave
+    h_ref(0)  = -infinity<double>::value;  // x(0)=-0.2
+    h_ref(1)  = infinity<double>::value;   // x(1)= 0.0
+    h_ref(2)  = 1.222650544183893e+00;     // x(2) =0.2
+    h_ref(3)  = 3.105965785455429e-01;     // x(3) =0.8
+    h_ref(4)  = 8.630833369753976e-02;     // x(4) =1.6
+    h_ref(5)  = 1.021300107861738e-03;     // x(5) =5.1
+    h_ref(6)  = 4.037929576538113e+00;     // x(6) =0.01
+    h_ref(7)  = 6.331539364136149e+00;     // x(7) =0.001
+    h_ref(8)  = 2.193839343955205e-01;     // x(8) =1.0
+    h_ref(9)  = 2.190164225274689e-01;     // x(9) =1.001
+    h_ref(10) = 2.157416237944899e-01;     // x(10)=1.01
+    h_ref(11) = 1.859909045360401e-01;     // x(11)=1.1
+    h_ref(12) = 9.218811688716196e-05;     // x(12)=7.2
+    h_ref(13) = 2.996734771597901e-06;     // x(13)=10.3
+    h_ref(14) = 1.254522935050609e-08;     // x(14)=15.4
+
+    EXPECT_EQ(h_ref(0), h_expint(0));
+    EXPECT_EQ(h_ref(1), h_expint(1));
+    for (int i = 2; i < 15; i++) {
+      EXPECT_LE(std::abs(h_expint(i) - h_ref(i)), std::abs(h_ref(i)) * 1e-15);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_expint(i) = Kokkos::Experimental::expint1(d_x(i));
+  }
+};
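+
+// For context: expint1 evaluates the exponential integral
+// E1(x) = \int_x^inf exp(-t)/t dt, which diverges as x -> 0+ and is not
+// defined for negative real x; the reference data above encodes those two
+// cases as +inf and -inf respectively.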
+
+template <class ExecSpace>
+struct TestComplexErrorFunction {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+  using DblViewType     = Kokkos::View<double*, ExecSpace>;
+  using DblHostViewType = Kokkos::View<double*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_erf, d_erfcx;
+  typename ViewType::HostMirror h_z, h_erf, h_erfcx;
+  HostViewType h_ref_erf, h_ref_erfcx;
+
+  DblViewType d_x, d_erfcx_dbl;
+  typename DblViewType::HostMirror h_x, h_erfcx_dbl;
+  DblHostViewType h_ref_erfcx_dbl;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    d_z         = ViewType("d_z", 52);
+    d_erf       = ViewType("d_erf", 52);
+    d_erfcx     = ViewType("d_erfcx", 52);
+    h_z         = Kokkos::create_mirror_view(d_z);
+    h_erf       = Kokkos::create_mirror_view(d_erf);
+    h_erfcx     = Kokkos::create_mirror_view(d_erfcx);
+    h_ref_erf   = HostViewType("h_ref_erf", 52);
+    h_ref_erfcx = HostViewType("h_ref_erfcx", 52);
+
+    d_x             = DblViewType("d_x", 6);
+    d_erfcx_dbl     = DblViewType("d_erfcx_dbl", 6);
+    h_x             = Kokkos::create_mirror_view(d_x);
+    h_erfcx_dbl     = Kokkos::create_mirror_view(d_erfcx_dbl);
+    h_ref_erfcx_dbl = DblHostViewType("h_ref_erfcx_dbl", 6);
+
+    // Generate test inputs
+    // abs(z)<=2
+    h_z(0)  = Kokkos::complex<double>(0.0011, 0);
+    h_z(1)  = Kokkos::complex<double>(-0.0011, 0);
+    h_z(2)  = Kokkos::complex<double>(1.4567, 0);
+    h_z(3)  = Kokkos::complex<double>(-1.4567, 0);
+    h_z(4)  = Kokkos::complex<double>(0, 0.0011);
+    h_z(5)  = Kokkos::complex<double>(0, -0.0011);
+    h_z(6)  = Kokkos::complex<double>(0, 1.4567);
+    h_z(7)  = Kokkos::complex<double>(0, -1.4567);
+    h_z(8)  = Kokkos::complex<double>(1.4567, 0.0011);
+    h_z(9)  = Kokkos::complex<double>(1.4567, -0.0011);
+    h_z(10) = Kokkos::complex<double>(-1.4567, 0.0011);
+    h_z(11) = Kokkos::complex<double>(-1.4567, -0.0011);
+    h_z(12) = Kokkos::complex<double>(1.4567, 0.5942);
+    h_z(13) = Kokkos::complex<double>(1.4567, -0.5942);
+    h_z(14) = Kokkos::complex<double>(-1.4567, 0.5942);
+    h_z(15) = Kokkos::complex<double>(-1.4567, -0.5942);
+    h_z(16) = Kokkos::complex<double>(0.0011, 0.5942);
+    h_z(17) = Kokkos::complex<double>(0.0011, -0.5942);
+    h_z(18) = Kokkos::complex<double>(-0.0011, 0.5942);
+    h_z(19) = Kokkos::complex<double>(-0.0011, -0.5942);
+    h_z(20) = Kokkos::complex<double>(0.0011, 0.0051);
+    h_z(21) = Kokkos::complex<double>(0.0011, -0.0051);
+    h_z(22) = Kokkos::complex<double>(-0.0011, 0.0051);
+    h_z(23) = Kokkos::complex<double>(-0.0011, -0.0051);
+    // abs(z)>2.0 and x>1
+    h_z(24) = Kokkos::complex<double>(3.5, 0.0011);
+    h_z(25) = Kokkos::complex<double>(3.5, -0.0011);
+    h_z(26) = Kokkos::complex<double>(-3.5, 0.0011);
+    h_z(27) = Kokkos::complex<double>(-3.5, -0.0011);
+    h_z(28) = Kokkos::complex<double>(3.5, 9.7);
+    h_z(29) = Kokkos::complex<double>(3.5, -9.7);
+    h_z(30) = Kokkos::complex<double>(-3.5, 9.7);
+    h_z(31) = Kokkos::complex<double>(-3.5, -9.7);
+    h_z(32) = Kokkos::complex<double>(18.9, 9.7);
+    h_z(33) = Kokkos::complex<double>(18.9, -9.7);
+    h_z(34) = Kokkos::complex<double>(-18.9, 9.7);
+    h_z(35) = Kokkos::complex<double>(-18.9, -9.7);
+    // abs(z)>2.0 and 0<=x<=1 and abs(y)<6
+    h_z(36) = Kokkos::complex<double>(0.85, 3.5);
+    h_z(37) = Kokkos::complex<double>(0.85, -3.5);
+    h_z(38) = Kokkos::complex<double>(-0.85, 3.5);
+    h_z(39) = Kokkos::complex<double>(-0.85, -3.5);
+    h_z(40) = Kokkos::complex<double>(0.0011, 3.5);
+    h_z(41) = Kokkos::complex<double>(0.0011, -3.5);
+    h_z(42) = Kokkos::complex<double>(-0.0011, 3.5);
+    h_z(43) = Kokkos::complex<double>(-0.0011, -3.5);
+    // abs(z)>2.0 and 0<=x<=1 and abs(y)>=6
+    h_z(44) = Kokkos::complex<double>(0.85, 7.5);
+    h_z(45) = Kokkos::complex<double>(0.85, -7.5);
+    h_z(46) = Kokkos::complex<double>(-0.85, 7.5);
+    h_z(47) = Kokkos::complex<double>(-0.85, -7.5);
+    h_z(48) = Kokkos::complex<double>(0.85, 19.7);
+    h_z(49) = Kokkos::complex<double>(0.85, -19.7);
+    h_z(50) = Kokkos::complex<double>(-0.85, 19.7);
+    h_z(51) = Kokkos::complex<double>(-0.85, -19.7);
+
+    h_x(0) = -infinity<double>::value;
+    h_x(1) = -1.2;
+    h_x(2) = 0.0;
+    h_x(3) = 1.2;
+    h_x(4) = 10.5;
+    h_x(5) = infinity<double>::value;
+
+    Kokkos::deep_copy(d_z, h_z);
+    Kokkos::deep_copy(d_x, h_x);
+
+    // Call erf and erfcx functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 52), *this);
+    Kokkos::fence();
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestRealErfcxTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_erf, d_erf);
+    Kokkos::deep_copy(h_erfcx, d_erfcx);
+    Kokkos::deep_copy(h_erfcx_dbl, d_erfcx_dbl);
+
+    // Reference values computed with Octave
+    h_ref_erf(0) = Kokkos::complex<double>(0.001241216583181022, 0);
+    h_ref_erf(1) = Kokkos::complex<double>(-0.001241216583181022, 0);
+    h_ref_erf(2) = Kokkos::complex<double>(0.9606095744865353, 0);
+    h_ref_erf(3) = Kokkos::complex<double>(-0.9606095744865353, 0);
+    h_ref_erf(4) = Kokkos::complex<double>(0, 0.001241217584429469);
+    h_ref_erf(5) = Kokkos::complex<double>(0, -0.001241217584429469);
+    h_ref_erf(6) = Kokkos::complex<double>(0, 4.149756424218223);
+    h_ref_erf(7) = Kokkos::complex<double>(0, -4.149756424218223);
+    h_ref_erf(8) =
+        Kokkos::complex<double>(0.960609812745064, 0.0001486911741082233);
+    h_ref_erf(9) =
+        Kokkos::complex<double>(0.960609812745064, -0.0001486911741082233);
+    h_ref_erf(10) =
+        Kokkos::complex<double>(-0.960609812745064, 0.0001486911741082233);
+    h_ref_erf(11) =
+        Kokkos::complex<double>(-0.960609812745064, -0.0001486911741082233);
+    h_ref_erf(12) =
+        Kokkos::complex<double>(1.02408827958197, 0.04828570635603527);
+    h_ref_erf(13) =
+        Kokkos::complex<double>(1.02408827958197, -0.04828570635603527);
+    h_ref_erf(14) =
+        Kokkos::complex<double>(-1.02408827958197, 0.04828570635603527);
+    h_ref_erf(15) =
+        Kokkos::complex<double>(-1.02408827958197, -0.04828570635603527);
+    h_ref_erf(16) =
+        Kokkos::complex<double>(0.001766791817179109, 0.7585038120712589);
+    h_ref_erf(17) =
+        Kokkos::complex<double>(0.001766791817179109, -0.7585038120712589);
+    h_ref_erf(18) =
+        Kokkos::complex<double>(-0.001766791817179109, 0.7585038120712589);
+    h_ref_erf(19) =
+        Kokkos::complex<double>(-0.001766791817179109, -0.7585038120712589);
+    h_ref_erf(20) =
+        Kokkos::complex<double>(0.001241248867618165, 0.005754776682713324);
+    h_ref_erf(21) =
+        Kokkos::complex<double>(0.001241248867618165, -0.005754776682713324);
+    h_ref_erf(22) =
+        Kokkos::complex<double>(-0.001241248867618165, 0.005754776682713324);
+    h_ref_erf(23) =
+        Kokkos::complex<double>(-0.001241248867618165, -0.005754776682713324);
+    h_ref_erf(24) =
+        Kokkos::complex<double>(0.9999992569244941, 5.939313159932013e-09);
+    h_ref_erf(25) =
+        Kokkos::complex<double>(0.9999992569244941, -5.939313159932013e-09);
+    h_ref_erf(26) =
+        Kokkos::complex<double>(-0.9999992569244941, 5.939313159932013e-09);
+    h_ref_erf(27) =
+        Kokkos::complex<double>(-0.9999992569244941, -5.939313159932013e-09);
+    h_ref_erf(28) =
+        Kokkos::complex<double>(-1.915595842013002e+34, 1.228821279117683e+32);
+    h_ref_erf(29) =
+        Kokkos::complex<double>(-1.915595842013002e+34, -1.228821279117683e+32);
+    h_ref_erf(30) =
+        Kokkos::complex<double>(1.915595842013002e+34, 1.228821279117683e+32);
+    h_ref_erf(31) =
+        Kokkos::complex<double>(1.915595842013002e+34, -1.228821279117683e+32);
+    h_ref_erf(32) = Kokkos::complex<double>(1, 5.959897539826596e-117);
+    h_ref_erf(33) = Kokkos::complex<double>(1, -5.959897539826596e-117);
+    h_ref_erf(34) = Kokkos::complex<double>(-1, 5.959897539826596e-117);
+    h_ref_erf(35) = Kokkos::complex<double>(-1, -5.959897539826596e-117);
+    h_ref_erf(36) =
+        Kokkos::complex<double>(-9211.077162784413, 13667.93825589455);
+    h_ref_erf(37) =
+        Kokkos::complex<double>(-9211.077162784413, -13667.93825589455);
+    h_ref_erf(38) =
+        Kokkos::complex<double>(9211.077162784413, 13667.93825589455);
+    h_ref_erf(39) =
+        Kokkos::complex<double>(9211.077162784413, -13667.93825589455);
+    h_ref_erf(40) = Kokkos::complex<double>(259.38847811225, 35281.28906479814);
+    h_ref_erf(41) =
+        Kokkos::complex<double>(259.38847811225, -35281.28906479814);
+    h_ref_erf(42) =
+        Kokkos::complex<double>(-259.38847811225, 35281.28906479814);
+    h_ref_erf(43) =
+        Kokkos::complex<double>(-259.38847811225, -35281.28906479814);
+    h_ref_erf(44) =
+        Kokkos::complex<double>(6.752085728270252e+21, 9.809477366939276e+22);
+    h_ref_erf(45) =
+        Kokkos::complex<double>(6.752085728270252e+21, -9.809477366939276e+22);
+    h_ref_erf(46) =
+        Kokkos::complex<double>(-6.752085728270252e+21, 9.809477366939276e+22);
+    h_ref_erf(47) =
+        Kokkos::complex<double>(-6.752085728270252e+21, -9.809477366939276e+22);
+    h_ref_erf(48) =
+        Kokkos::complex<double>(4.37526734926942e+166, -2.16796709605852e+166);
+    h_ref_erf(49) =
+        Kokkos::complex<double>(4.37526734926942e+166, 2.16796709605852e+166);
+    h_ref_erf(50) =
+        Kokkos::complex<double>(-4.37526734926942e+166, -2.16796709605852e+166);
+    h_ref_erf(51) =
+        Kokkos::complex<double>(-4.37526734926942e+166, 2.16796709605852e+166);
+
+    h_ref_erfcx(0) = Kokkos::complex<double>(0.9987599919156778, 0);
+    h_ref_erfcx(1) = Kokkos::complex<double>(1.001242428085786, 0);
+    h_ref_erfcx(2) = Kokkos::complex<double>(0.3288157848563544, 0);
+    h_ref_erfcx(3) = Kokkos::complex<double>(16.36639786516915, 0);
+    h_ref_erfcx(4) =
+        Kokkos::complex<double>(0.999998790000732, -0.001241216082557101);
+    h_ref_erfcx(5) =
+        Kokkos::complex<double>(0.999998790000732, 0.001241216082557101);
+    h_ref_erfcx(6) =
+        Kokkos::complex<double>(0.1197948131677216, -0.4971192955307743);
+    h_ref_erfcx(7) =
+        Kokkos::complex<double>(0.1197948131677216, 0.4971192955307743);
+    h_ref_erfcx(8) =
+        Kokkos::complex<double>(0.3288156873503045, -0.0001874479383970247);
+    h_ref_erfcx(9) =
+        Kokkos::complex<double>(0.3288156873503045, 0.0001874479383970247);
+    h_ref_erfcx(10) =
+        Kokkos::complex<double>(16.36629202874158, -0.05369111060785572);
+    h_ref_erfcx(11) =
+        Kokkos::complex<double>(16.36629202874158, 0.05369111060785572);
+    h_ref_erfcx(12) =
+        Kokkos::complex<double>(0.3020886508118801, -0.09424097887578842);
+    h_ref_erfcx(13) =
+        Kokkos::complex<double>(0.3020886508118801, 0.09424097887578842);
+    h_ref_erfcx(14) =
+        Kokkos::complex<double>(-2.174707722732267, -11.67259764091796);
+    h_ref_erfcx(15) =
+        Kokkos::complex<double>(-2.174707722732267, 11.67259764091796);
+    h_ref_erfcx(16) =
+        Kokkos::complex<double>(0.7019810779371267, -0.5319516793968513);
+    h_ref_erfcx(17) =
+        Kokkos::complex<double>(0.7019810779371267, 0.5319516793968513);
+    h_ref_erfcx(18) =
+        Kokkos::complex<double>(0.7030703366403597, -0.5337884198542978);
+    h_ref_erfcx(19) =
+        Kokkos::complex<double>(0.7030703366403597, 0.5337884198542978);
+    h_ref_erfcx(20) =
+        Kokkos::complex<double>(0.9987340467266177, -0.005743428170378673);
+    h_ref_erfcx(21) =
+        Kokkos::complex<double>(0.9987340467266177, 0.005743428170378673);
+    h_ref_erfcx(22) =
+        Kokkos::complex<double>(1.001216353762532, -0.005765867613873103);
+    h_ref_erfcx(23) =
+        Kokkos::complex<double>(1.001216353762532, 0.005765867613873103);
+    h_ref_erfcx(24) =
+        Kokkos::complex<double>(0.1552936427089241, -4.545593205871305e-05);
+    h_ref_erfcx(25) =
+        Kokkos::complex<double>(0.1552936427089241, 4.545593205871305e-05);
+    h_ref_erfcx(26) =
+        Kokkos::complex<double>(417949.5262869648, -3218.276197742372);
+    h_ref_erfcx(27) =
+        Kokkos::complex<double>(417949.5262869648, 3218.276197742372);
+    h_ref_erfcx(28) =
+        Kokkos::complex<double>(0.01879467905925653, -0.0515934271478583);
+    h_ref_erfcx(29) =
+        Kokkos::complex<double>(0.01879467905925653, 0.0515934271478583);
+    h_ref_erfcx(30) =
+        Kokkos::complex<double>(-0.01879467905925653, -0.0515934271478583);
+    h_ref_erfcx(31) =
+        Kokkos::complex<double>(-0.01879467905925653, 0.0515934271478583);
+    h_ref_erfcx(32) =
+        Kokkos::complex<double>(0.02362328821805, -0.01209735551897239);
+    h_ref_erfcx(33) =
+        Kokkos::complex<double>(0.02362328821805, 0.01209735551897239);
+    h_ref_erfcx(34) = Kokkos::complex<double>(-2.304726099084567e+114,
+                                              -2.942443198107089e+114);
+    h_ref_erfcx(35) = Kokkos::complex<double>(-2.304726099084567e+114,
+                                              2.942443198107089e+114);
+    h_ref_erfcx(36) =
+        Kokkos::complex<double>(0.04174017523145063, -0.1569865319886248);
+    h_ref_erfcx(37) =
+        Kokkos::complex<double>(0.04174017523145063, 0.1569865319886248);
+    h_ref_erfcx(38) =
+        Kokkos::complex<double>(-0.04172154858670504, -0.156980085534407);
+    h_ref_erfcx(39) =
+        Kokkos::complex<double>(-0.04172154858670504, 0.156980085534407);
+    h_ref_erfcx(40) =
+        Kokkos::complex<double>(6.355803055239174e-05, -0.1688298297427782);
+    h_ref_erfcx(41) =
+        Kokkos::complex<double>(6.355803055239174e-05, 0.1688298297427782);
+    h_ref_erfcx(42) =
+        Kokkos::complex<double>(-5.398806789669434e-05, -0.168829903432947);
+    h_ref_erfcx(43) =
+        Kokkos::complex<double>(-5.398806789669434e-05, 0.168829903432947);
+    h_ref_erfcx(44) =
+        Kokkos::complex<double>(0.008645103282302355, -0.07490521021566741);
+    h_ref_erfcx(45) =
+        Kokkos::complex<double>(0.008645103282302355, 0.07490521021566741);
+    h_ref_erfcx(46) =
+        Kokkos::complex<double>(-0.008645103282302355, -0.07490521021566741);
+    h_ref_erfcx(47) =
+        Kokkos::complex<double>(-0.008645103282302355, 0.07490521021566741);
+    h_ref_erfcx(48) =
+        Kokkos::complex<double>(0.001238176693606428, -0.02862247416909219);
+    h_ref_erfcx(49) =
+        Kokkos::complex<double>(0.001238176693606428, 0.02862247416909219);
+    h_ref_erfcx(50) =
+        Kokkos::complex<double>(-0.001238176693606428, -0.02862247416909219);
+    h_ref_erfcx(51) =
+        Kokkos::complex<double>(-0.001238176693606428, 0.02862247416909219);
+
+    h_ref_erfcx_dbl(0) = infinity<double>::value;
+    h_ref_erfcx_dbl(1) = 8.062854217063865e+00;
+    h_ref_erfcx_dbl(2) = 1.0;
+    h_ref_erfcx_dbl(3) = 3.785374169292397e-01;
+    h_ref_erfcx_dbl(4) = 5.349189974656411e-02;
+    h_ref_erfcx_dbl(5) = 0.0;
+
+    for (int i = 0; i < 52; i++) {
+      EXPECT_LE(Kokkos::abs(h_erf(i) - h_ref_erf(i)),
+                Kokkos::abs(h_ref_erf(i)) * 1e-13);
+    }
+
+    for (int i = 0; i < 52; i++) {
+      EXPECT_LE(Kokkos::abs(h_erfcx(i) - h_ref_erfcx(i)),
+                Kokkos::abs(h_ref_erfcx(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_erfcx_dbl(0), h_ref_erfcx_dbl(0));
+    EXPECT_EQ(h_erfcx_dbl(5), h_ref_erfcx_dbl(5));
+    for (int i = 1; i < 5; i++) {
+      EXPECT_LE(std::abs(h_erfcx_dbl(i) - h_ref_erfcx_dbl(i)),
+                std::abs(h_ref_erfcx_dbl(i)) * 1e-13);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_erf(i)   = Kokkos::Experimental::erf(d_z(i));
+    d_erfcx(i) = Kokkos::Experimental::erfcx(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestRealErfcxTag&, const int& /*i*/) const {
+    d_erfcx_dbl(0) = Kokkos::Experimental::erfcx(d_x(0));
+    d_erfcx_dbl(1) = Kokkos::Experimental::erfcx(d_x(1));
+    d_erfcx_dbl(2) = Kokkos::Experimental::erfcx(d_x(2));
+    d_erfcx_dbl(3) = Kokkos::Experimental::erfcx(d_x(3));
+    d_erfcx_dbl(4) = Kokkos::Experimental::erfcx(d_x(4));
+    d_erfcx_dbl(5) = Kokkos::Experimental::erfcx(d_x(5));
+  }
+};
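+
+// For context: erfcx is the scaled complementary error function,
+// erfcx(z) = exp(z^2) * erfc(z). The scaling keeps results representable
+// where erfc itself underflows, which is why the real-axis references above
+// remain finite out to x = 10.5 and decay to 0 only at +inf.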
+
+template <class ExecSpace>
+struct TestComplexBesselJ0Y0Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbj0, d_cby0;
+  typename ViewType::HostMirror h_z, h_cbj0, h_cby0;
+  HostViewType h_ref_cbj0, h_ref_cby0;
+
+  ViewType d_z_large, d_cbj0_large, d_cby0_large;
+  typename ViewType::HostMirror h_z_large, h_cbj0_large, h_cby0_large;
+  HostViewType h_ref_cbj0_large, h_ref_cby0_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbj0     = ViewType("d_cbj0", N);
+    d_cby0     = ViewType("d_cby0", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbj0     = Kokkos::create_mirror_view(d_cbj0);
+    h_cby0     = Kokkos::create_mirror_view(d_cby0);
+    h_ref_cbj0 = HostViewType("h_ref_cbj0", N);
+    h_ref_cby0 = HostViewType("h_ref_cby0", N);
+
+    // Generate test inputs
+    h_z(0) = Kokkos::complex<double>(0.0, 0.0);
+    // abs(z)<=25
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    // abs(z)>25
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj0, d_cbj0);
+    Kokkos::deep_copy(h_cby0, d_cby0);
+
+    // Reference values computed with Octave
+    h_ref_cbj0(0) = Kokkos::complex<double>(1.000000000000000e+00, 0);
+    h_ref_cbj0(1) =
+        Kokkos::complex<double>(-1.249234879607422e+00, -9.479837920577351e-01);
+    h_ref_cbj0(2) =
+        Kokkos::complex<double>(-1.249234879607422e+00, +9.479837920577351e-01);
+    h_ref_cbj0(3) =
+        Kokkos::complex<double>(-1.249234879607422e+00, +9.479837920577351e-01);
+    h_ref_cbj0(4) =
+        Kokkos::complex<double>(-1.249234879607422e+00, -9.479837920577351e-01);
+    h_ref_cbj0(5) =
+        Kokkos::complex<double>(-1.602439981218195e+03, +7.230667451989807e+02);
+    h_ref_cbj0(6) =
+        Kokkos::complex<double>(-1.602439981218195e+03, -7.230667451989807e+02);
+    h_ref_cbj0(7) =
+        Kokkos::complex<double>(-1.602439981218195e+03, -7.230667451989807e+02);
+    h_ref_cbj0(8) =
+        Kokkos::complex<double>(-1.602439981218195e+03, +7.230667451989807e+02);
+    h_ref_cbj0(9) = Kokkos::complex<double>(-2.600519549019335e-01, 0);
+    h_ref_cbj0(10) =
+        Kokkos::complex<double>(-2.600519549019335e-01, +9.951051106466461e-18);
+    h_ref_cbj0(11) = Kokkos::complex<double>(-1.624127813134866e-01, 0);
+    h_ref_cbj0(12) =
+        Kokkos::complex<double>(-1.624127813134866e-01, -1.387778780781446e-17);
+    h_ref_cbj0(13) =
+        Kokkos::complex<double>(-1.012912188513958e+03, -1.256239636146142e+03);
+    h_ref_cbj0(14) =
+        Kokkos::complex<double>(-1.012912188513958e+03, +1.256239636146142e+03);
+    h_ref_cbj0(15) =
+        Kokkos::complex<double>(-1.012912188513958e+03, +1.256239636146142e+03);
+    h_ref_cbj0(16) =
+        Kokkos::complex<double>(-1.012912188513958e+03, -1.256239636146142e+03);
+    h_ref_cbj0(17) =
+        Kokkos::complex<double>(-1.040215134669324e+03, -4.338202386810095e+02);
+    h_ref_cbj0(18) =
+        Kokkos::complex<double>(-1.040215134669324e+03, +4.338202386810095e+02);
+    h_ref_cbj0(19) =
+        Kokkos::complex<double>(-1.040215134669324e+03, +4.338202386810095e+02);
+    h_ref_cbj0(20) =
+        Kokkos::complex<double>(-1.040215134669324e+03, -4.338202386810095e+02);
+    h_ref_cbj0(21) = Kokkos::complex<double>(-7.315701054899962e-02, 0);
+    h_ref_cbj0(22) =
+        Kokkos::complex<double>(-7.315701054899962e-02, -6.938893903907228e-18);
+    h_ref_cbj0(23) = Kokkos::complex<double>(-9.147180408906189e-02, 0);
+    h_ref_cbj0(24) =
+        Kokkos::complex<double>(-9.147180408906189e-02, +1.387778780781446e-17);
+
+    h_ref_cby0(0) = Kokkos::complex<double>(-infinity<double>::value, 0);
+    h_ref_cby0(1) =
+        Kokkos::complex<double>(1.000803196554890e+00, -1.231441609303427e+00);
+    h_ref_cby0(2) =
+        Kokkos::complex<double>(1.000803196554890e+00, +1.231441609303427e+00);
+    h_ref_cby0(3) =
+        Kokkos::complex<double>(-8.951643875605797e-01, -1.267028149911417e+00);
+    h_ref_cby0(4) =
+        Kokkos::complex<double>(-8.951643875605797e-01, +1.267028149911417e+00);
+    h_ref_cby0(5) =
+        Kokkos::complex<double>(-7.230667452992603e+02, -1.602439974000479e+03);
+    h_ref_cby0(6) =
+        Kokkos::complex<double>(-7.230667452992603e+02, +1.602439974000479e+03);
+    h_ref_cby0(7) =
+        Kokkos::complex<double>(7.230667450987011e+02, -1.602439988435912e+03);
+    h_ref_cby0(8) =
+        Kokkos::complex<double>(7.230667450987011e+02, +1.602439988435912e+03);
+    h_ref_cby0(9) = Kokkos::complex<double>(3.768500100127903e-01, 0);
+    h_ref_cby0(10) =
+        Kokkos::complex<double>(3.768500100127903e-01, -5.201039098038670e-01);
+    h_ref_cby0(11) = Kokkos::complex<double>(-3.598179027370283e-02, 0);
+    h_ref_cby0(12) =
+        Kokkos::complex<double>(-3.598179027370282e-02, -3.248255626269732e-01);
+    h_ref_cby0(13) =
+        Kokkos::complex<double>(1.256239642409530e+03, -1.012912186329053e+03);
+    h_ref_cby0(14) =
+        Kokkos::complex<double>(1.256239642409530e+03, +1.012912186329053e+03);
+    h_ref_cby0(15) =
+        Kokkos::complex<double>(-1.256239629882755e+03, -1.012912190698863e+03);
+    h_ref_cby0(16) =
+        Kokkos::complex<double>(-1.256239629882755e+03, +1.012912190698863e+03);
+    h_ref_cby0(17) =
+        Kokkos::complex<double>(4.338202411482646e+02, -1.040215130736213e+03);
+    h_ref_cby0(18) =
+        Kokkos::complex<double>(4.338202411482646e+02, +1.040215130736213e+03);
+    h_ref_cby0(19) =
+        Kokkos::complex<double>(-4.338202362137545e+02, -1.040215138602435e+03);
+    h_ref_cby0(20) =
+        Kokkos::complex<double>(-4.338202362137545e+02, +1.040215138602435e+03);
+    h_ref_cby0(21) = Kokkos::complex<double>(1.318364704235323e-01, 0);
+    h_ref_cby0(22) =
+        Kokkos::complex<double>(1.318364704235323e-01, -1.463140210979992e-01);
+    h_ref_cby0(23) = Kokkos::complex<double>(4.735895220944939e-02, 0);
+    h_ref_cby0(24) =
+        Kokkos::complex<double>(4.735895220944938e-02, -1.829436081781237e-01);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbj0(i) - h_ref_cbj0(i)),
+                Kokkos::abs(h_ref_cbj0(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cby0(0), h_cby0(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cby0(i) - h_ref_cby0(i)),
+                Kokkos::abs(h_ref_cby0(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbj0_large     = ViewType("d_cbj0_large", 6);
+    d_cby0_large     = ViewType("d_cby0_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbj0_large     = Kokkos::create_mirror_view(d_cbj0_large);
+    h_cby0_large     = Kokkos::create_mirror_view(d_cby0_large);
+    h_ref_cbj0_large = HostViewType("h_ref_cbj0_large", 2);
+    h_ref_cby0_large = HostViewType("h_ref_cby0_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(1) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(2) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(3) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(4) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(5) = Kokkos::complex<double>(-10000.0, 100.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj0_large, d_cbj0_large);
+    Kokkos::deep_copy(h_cby0_large, d_cby0_large);
+
+    h_ref_cbj0_large(0) =
+        Kokkos::complex<double>(-9.561811498244175e+40, -4.854995782103029e+40);
+    h_ref_cbj0_large(1) =
+        Kokkos::complex<double>(-9.561811498244175e+40, +4.854995782103029e+40);
+
+    h_ref_cby0_large(0) =
+        Kokkos::complex<double>(4.854995782103029e+40, -9.561811498244175e+40);
+    h_ref_cby0_large(1) =
+        Kokkos::complex<double>(-4.854995782103029e+40, -9.561811498244175e+40);
+
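+    // Entries 0 and 3 use the default algorithm parameters, 1 and 4 the
+    // overrides (11000, 3000), and 2 and 5 the overrides (11000, 7500). The
+    // checks below pin the resulting relative errors: defaults land between
+    // ~1e-13 and ~1e-12, the first override degrades beyond 1e-6, and the
+    // second reaches ~1e-13.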
+    EXPECT_TRUE((Kokkos::abs(h_cbj0_large(0) - h_ref_cbj0_large(0)) <
+                 Kokkos::abs(h_ref_cbj0_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cbj0_large(0) - h_ref_cbj0_large(0)) >
+                 Kokkos::abs(h_ref_cbj0_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(1) - h_ref_cbj0_large(0)) >
+                Kokkos::abs(h_ref_cbj0_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(2) - h_ref_cbj0_large(0)) <
+                Kokkos::abs(h_ref_cbj0_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cbj0_large(3) - h_ref_cbj0_large(1)) <
+                 Kokkos::abs(h_ref_cbj0_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cbj0_large(3) - h_ref_cbj0_large(1)) >
+                 Kokkos::abs(h_ref_cbj0_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(4) - h_ref_cbj0_large(1)) >
+                Kokkos::abs(h_ref_cbj0_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(5) - h_ref_cbj0_large(1)) <
+                Kokkos::abs(h_ref_cbj0_large(1)) * 1e-13);
+
+    EXPECT_TRUE((Kokkos::abs(h_cby0_large(0) - h_ref_cby0_large(0)) <
+                 Kokkos::abs(h_ref_cby0_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cby0_large(0) - h_ref_cby0_large(0)) >
+                 Kokkos::abs(h_ref_cby0_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(1) - h_ref_cby0_large(0)) >
+                Kokkos::abs(h_ref_cby0_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(2) - h_ref_cby0_large(0)) <
+                Kokkos::abs(h_ref_cby0_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cby0_large(3) - h_ref_cby0_large(1)) <
+                 Kokkos::abs(h_ref_cby0_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cby0_large(3) - h_ref_cby0_large(1)) >
+                 Kokkos::abs(h_ref_cby0_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(4) - h_ref_cby0_large(1)) >
+                Kokkos::abs(h_ref_cby0_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(5) - h_ref_cby0_large(1)) <
+                Kokkos::abs(h_ref_cby0_large(1)) * 1e-13);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbj0(i) = Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cby0(i) = Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
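+    // Evaluate each argument with the default algorithm parameters and with
+    // two explicit overrides (the trailing arguments); testit() checks
+    // which variants remain accurate.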
+    d_cbj0_large(0) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbj0_large(1) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cbj0_large(2) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cbj0_large(3) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbj0_large(4) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cbj0_large(5) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+
+    d_cby0_large(0) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cby0_large(1) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cby0_large(2) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cby0_large(3) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cby0_large(4) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cby0_large(5) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselJ1Y1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbj1, d_cby1;
+  typename ViewType::HostMirror h_z, h_cbj1, h_cby1;
+  HostViewType h_ref_cbj1, h_ref_cby1;
+
+  ViewType d_z_large, d_cbj1_large, d_cby1_large;
+  typename ViewType::HostMirror h_z_large, h_cbj1_large, h_cby1_large;
+  HostViewType h_ref_cbj1_large, h_ref_cby1_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbj1     = ViewType("d_cbj1", N);
+    d_cby1     = ViewType("d_cby1", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbj1     = Kokkos::create_mirror_view(d_cbj1);
+    h_cby1     = Kokkos::create_mirror_view(d_cby1);
+    h_ref_cbj1 = HostViewType("h_ref_cbj1", N);
+    h_ref_cby1 = HostViewType("h_ref_cby1", N);
+
+    // Generate test inputs
+    h_z(0) = Kokkos::complex<double>(0.0, 0.0);
+    // abs(z)<=25
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    // abs(z)>25
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj1, d_cbj1);
+    Kokkos::deep_copy(h_cby1, d_cby1);
+
+    // Reference values computed with Octave
+    h_ref_cbj1(0) = Kokkos::complex<double>(0, 0);
+    h_ref_cbj1(1) =
+        Kokkos::complex<double>(7.801488485792540e-01, -1.260982060238848e+00);
+    h_ref_cbj1(2) =
+        Kokkos::complex<double>(7.801488485792540e-01, +1.260982060238848e+00);
+    h_ref_cbj1(3) =
+        Kokkos::complex<double>(-7.801488485792543e-01, -1.260982060238848e+00);
+    h_ref_cbj1(4) =
+        Kokkos::complex<double>(-7.801488485792543e-01, +1.260982060238848e+00);
+    h_ref_cbj1(5) =
+        Kokkos::complex<double>(-7.469476253429664e+02, -1.576608505254311e+03);
+    h_ref_cbj1(6) =
+        Kokkos::complex<double>(-7.469476253429664e+02, +1.576608505254311e+03);
+    h_ref_cbj1(7) =
+        Kokkos::complex<double>(7.469476253429661e+02, -1.576608505254311e+03);
+    h_ref_cbj1(8) =
+        Kokkos::complex<double>(7.469476253429661e+02, +1.576608505254311e+03);
+    h_ref_cbj1(9) = Kokkos::complex<double>(3.390589585259365e-01, 0);
+    h_ref_cbj1(10) =
+        Kokkos::complex<double>(-3.390589585259365e-01, +3.373499138396203e-17);
+    h_ref_cbj1(11) = Kokkos::complex<double>(-3.951932188370151e-02, 0);
+    h_ref_cbj1(12) =
+        Kokkos::complex<double>(3.951932188370151e-02, +7.988560221984213e-18);
+    h_ref_cbj1(13) =
+        Kokkos::complex<double>(1.233147100257312e+03, -1.027302265904111e+03);
+    h_ref_cbj1(14) =
+        Kokkos::complex<double>(1.233147100257312e+03, +1.027302265904111e+03);
+    h_ref_cbj1(15) =
+        Kokkos::complex<double>(-1.233147100257312e+03, -1.027302265904111e+03);
+    h_ref_cbj1(16) =
+        Kokkos::complex<double>(-1.233147100257312e+03, +1.027302265904111e+03);
+    h_ref_cbj1(17) =
+        Kokkos::complex<double>(4.248029136732908e+02, -1.042364939115052e+03);
+    h_ref_cbj1(18) =
+        Kokkos::complex<double>(4.248029136732908e+02, +1.042364939115052e+03);
+    h_ref_cbj1(19) =
+        Kokkos::complex<double>(-4.248029136732909e+02, -1.042364939115052e+03);
+    h_ref_cbj1(20) =
+        Kokkos::complex<double>(-4.248029136732909e+02, +1.042364939115052e+03);
+    h_ref_cbj1(21) = Kokkos::complex<double>(1.305514883350938e-01, 0);
+    h_ref_cbj1(22) =
+        Kokkos::complex<double>(-1.305514883350938e-01, +7.993709105806192e-18);
+    h_ref_cbj1(23) = Kokkos::complex<double>(4.659838375816632e-02, 0);
+    h_ref_cbj1(24) =
+        Kokkos::complex<double>(-4.659838375816632e-02, +6.322680793358811e-18);
+
+    h_ref_cby1(0) = Kokkos::complex<double>(-infinity<double>::value, 0);
+    h_ref_cby1(1) =
+        Kokkos::complex<double>(1.285849341463599e+00, +7.250812532419394e-01);
+    h_ref_cby1(2) =
+        Kokkos::complex<double>(1.285849341463599e+00, -7.250812532419394e-01);
+    h_ref_cby1(3) =
+        Kokkos::complex<double>(1.236114779014097e+00, -8.352164439165690e-01);
+    h_ref_cby1(4) =
+        Kokkos::complex<double>(1.236114779014097e+00, +8.352164439165690e-01);
+    h_ref_cby1(5) =
+        Kokkos::complex<double>(1.576608512528508e+03, -7.469476251109801e+02);
+    h_ref_cby1(6) =
+        Kokkos::complex<double>(1.576608512528508e+03, +7.469476251109801e+02);
+    h_ref_cby1(7) =
+        Kokkos::complex<double>(1.576608497980113e+03, +7.469476255749524e+02);
+    h_ref_cby1(8) =
+        Kokkos::complex<double>(1.576608497980113e+03, -7.469476255749524e+02);
+    h_ref_cby1(9) = Kokkos::complex<double>(3.246744247918000e-01, 0);
+    h_ref_cby1(10) =
+        Kokkos::complex<double>(-3.246744247918000e-01, -6.781179170518730e-01);
+    h_ref_cby1(11) = Kokkos::complex<double>(1.616692009926331e-01, 0);
+    h_ref_cby1(12) =
+        Kokkos::complex<double>(-1.616692009926332e-01, +7.903864376740302e-02);
+    h_ref_cby1(13) =
+        Kokkos::complex<double>(1.027302268200224e+03, +1.233147093992241e+03);
+    h_ref_cby1(14) =
+        Kokkos::complex<double>(1.027302268200224e+03, -1.233147093992241e+03);
+    h_ref_cby1(15) =
+        Kokkos::complex<double>(1.027302263607999e+03, -1.233147106522383e+03);
+    h_ref_cby1(16) =
+        Kokkos::complex<double>(1.027302263607999e+03, +1.233147106522383e+03);
+    h_ref_cby1(17) =
+        Kokkos::complex<double>(1.042364943073579e+03, +4.248029112344685e+02);
+    h_ref_cby1(18) =
+        Kokkos::complex<double>(1.042364943073579e+03, -4.248029112344685e+02);
+    h_ref_cby1(19) =
+        Kokkos::complex<double>(1.042364935156525e+03, -4.248029161121132e+02);
+    h_ref_cby1(20) =
+        Kokkos::complex<double>(1.042364935156525e+03, +4.248029161121132e+02);
+    h_ref_cby1(21) = Kokkos::complex<double>(7.552212658226459e-02, 0);
+    h_ref_cby1(22) =
+        Kokkos::complex<double>(-7.552212658226459e-02, -2.611029766701876e-01);
+    h_ref_cby1(23) = Kokkos::complex<double>(9.186960936986688e-02, 0);
+    h_ref_cby1(24) =
+        Kokkos::complex<double>(-9.186960936986688e-02, -9.319676751633262e-02);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbj1(i) - h_ref_cbj1(i)),
+                Kokkos::abs(h_ref_cbj1(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cby1(0), h_cby1(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cby1(i) - h_ref_cby1(i)),
+                Kokkos::abs(h_ref_cby1(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbj1_large     = ViewType("d_cbj1_large", 6);
+    d_cby1_large     = ViewType("d_cby1_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbj1_large     = Kokkos::create_mirror_view(d_cbj1_large);
+    h_cby1_large     = Kokkos::create_mirror_view(d_cby1_large);
+    h_ref_cbj1_large = HostViewType("h_ref_cbj1_large", 2);
+    h_ref_cby1_large = HostViewType("h_ref_cby1_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(1) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(2) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(3) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(4) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(5) = Kokkos::complex<double>(-10000.0, 100.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj1_large, d_cbj1_large);
+    Kokkos::deep_copy(h_cby1_large, d_cby1_large);
+
+    h_ref_cbj1_large(0) =
+        Kokkos::complex<double>(4.854515317906369e+40, -9.562049455402486e+40);
+    h_ref_cbj1_large(1) =
+        Kokkos::complex<double>(-4.854515317906371e+40, -9.562049455402486e+40);
+
+    h_ref_cby1_large(0) =
+        Kokkos::complex<double>(9.562049455402486e+40, 4.854515317906369e+40);
+    h_ref_cby1_large(1) =
+        Kokkos::complex<double>(9.562049455402486e+40, -4.854515317906369e+40);
+
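+    // Entries 0-2 evaluate the same argument with different algorithm
+    // parameters (see the TestLargeArgTag operator() below): entry 0
+    // (defaults) must agree with the reference to between 1e-13 and 1e-12,
+    // entry 2 to better than 1e-13, and entry 1 (an intentionally
+    // insufficient parameter choice) must be visibly off. Entries 3-5
+    // repeat the pattern for the mirrored argument.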
+    EXPECT_TRUE((Kokkos::abs(h_cbj1_large(0) - h_ref_cbj1_large(0)) <
+                 Kokkos::abs(h_ref_cbj1_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cbj1_large(0) - h_ref_cbj1_large(0)) >
+                 Kokkos::abs(h_ref_cbj1_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(1) - h_ref_cbj1_large(0)) >
+                Kokkos::abs(h_ref_cbj1_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(2) - h_ref_cbj1_large(0)) <
+                Kokkos::abs(h_ref_cbj1_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cbj1_large(3) - h_ref_cbj1_large(1)) <
+                 Kokkos::abs(h_ref_cbj1_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cbj1_large(3) - h_ref_cbj1_large(1)) >
+                 Kokkos::abs(h_ref_cbj1_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(4) - h_ref_cbj1_large(1)) >
+                Kokkos::abs(h_ref_cbj1_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(5) - h_ref_cbj1_large(1)) <
+                Kokkos::abs(h_ref_cbj1_large(1)) * 1e-13);
+
+    EXPECT_TRUE((Kokkos::abs(h_cby1_large(0) - h_ref_cby1_large(0)) <
+                 Kokkos::abs(h_ref_cby1_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cby1_large(0) - h_ref_cby1_large(0)) >
+                 Kokkos::abs(h_ref_cby1_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(1) - h_ref_cby1_large(0)) >
+                Kokkos::abs(h_ref_cby1_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(2) - h_ref_cby1_large(0)) <
+                Kokkos::abs(h_ref_cby1_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cby1_large(3) - h_ref_cby1_large(1)) <
+                 Kokkos::abs(h_ref_cby1_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cby1_large(3) - h_ref_cby1_large(1)) >
+                 Kokkos::abs(h_ref_cby1_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(4) - h_ref_cby1_large(1)) >
+                Kokkos::abs(h_ref_cby1_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(5) - h_ref_cby1_large(1)) <
+                Kokkos::abs(h_ref_cby1_large(1)) * 1e-13);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbj1(i) = Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cby1(i) = Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
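+    // Evaluate each argument with the default algorithm parameters and with
+    // two explicit overrides (the trailing arguments); testit() checks
+    // which variants remain accurate.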
+    d_cbj1_large(0) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbj1_large(1) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cbj1_large(2) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cbj1_large(3) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbj1_large(4) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cbj1_large(5) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+
+    d_cby1_large(0) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cby1_large(1) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cby1_large(2) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cby1_large(3) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cby1_large(4) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cby1_large(5) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselI0K0Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbi0, d_cbk0;
+  typename ViewType::HostMirror h_z, h_cbi0, h_cbk0;
+  HostViewType h_ref_cbi0, h_ref_cbk0;
+
+  ViewType d_z_large, d_cbi0_large;
+  typename ViewType::HostMirror h_z_large, h_cbi0_large;
+  HostViewType h_ref_cbi0_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbi0     = ViewType("d_cbi0", N);
+    d_cbk0     = ViewType("d_cbk0", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbi0     = Kokkos::create_mirror_view(d_cbi0);
+    h_cbk0     = Kokkos::create_mirror_view(d_cbk0);
+    h_ref_cbi0 = HostViewType("h_ref_cbi0", N);
+    h_ref_cbk0 = HostViewType("h_ref_cbk0", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi0, d_cbi0);
+    Kokkos::deep_copy(h_cbk0, d_cbk0);
+
+    // Reference values computed with Octave
+    h_ref_cbi0(0) = Kokkos::complex<double>(1.000000000000000e+00, 0);
+    h_ref_cbi0(1) =
+        Kokkos::complex<double>(-4.695171920440706e-01, +4.313788409468920e+00);
+    h_ref_cbi0(2) =
+        Kokkos::complex<double>(-4.695171920440706e-01, -4.313788409468920e+00);
+    h_ref_cbi0(3) =
+        Kokkos::complex<double>(-4.695171920440706e-01, -4.313788409468920e+00);
+    h_ref_cbi0(4) =
+        Kokkos::complex<double>(-4.695171920440706e-01, +4.313788409468920e+00);
+    h_ref_cbi0(5) =
+        Kokkos::complex<double>(-7.276526052028507e+08, -2.806354803468570e+08);
+    h_ref_cbi0(6) =
+        Kokkos::complex<double>(-7.276526052028507e+08, +2.806354803468570e+08);
+    h_ref_cbi0(7) =
+        Kokkos::complex<double>(-7.276526052028507e+08, +2.806354803468570e+08);
+    h_ref_cbi0(8) =
+        Kokkos::complex<double>(-7.276526052028507e+08, -2.806354803468570e+08);
+    h_ref_cbi0(9)  = Kokkos::complex<double>(4.880792585865025e+00, 0);
+    h_ref_cbi0(10) = Kokkos::complex<double>(4.880792585865025e+00, 0);
+    h_ref_cbi0(11) = Kokkos::complex<double>(8.151421225128924e+08, 0);
+    h_ref_cbi0(12) = Kokkos::complex<double>(8.151421225128924e+08, 0);
+    h_ref_cbi0(13) =
+        Kokkos::complex<double>(-9.775983282455373e+10, -4.159160389327644e+10);
+    h_ref_cbi0(14) =
+        Kokkos::complex<double>(-9.775983282455373e+10, +4.159160389327644e+10);
+    h_ref_cbi0(15) =
+        Kokkos::complex<double>(-9.775983282455373e+10, +4.159160389327644e+10);
+    h_ref_cbi0(16) =
+        Kokkos::complex<double>(-9.775983282455373e+10, -4.159160389327644e+10);
+    h_ref_cbi0(17) =
+        Kokkos::complex<double>(-5.158377566681892e+24, -2.766704059464302e+24);
+    h_ref_cbi0(18) =
+        Kokkos::complex<double>(-5.158377566681892e+24, +2.766704059464302e+24);
+    h_ref_cbi0(19) =
+        Kokkos::complex<double>(-5.158377566681892e+24, +2.766704059464302e+24);
+    h_ref_cbi0(20) =
+        Kokkos::complex<double>(-5.158377566681892e+24, -2.766704059464302e+24);
+    h_ref_cbi0(21) = Kokkos::complex<double>(1.095346047317573e+11, 0);
+    h_ref_cbi0(22) = Kokkos::complex<double>(1.095346047317573e+11, 0);
+    h_ref_cbi0(23) = Kokkos::complex<double>(5.894077055609803e+24, 0);
+    h_ref_cbi0(24) = Kokkos::complex<double>(5.894077055609803e+24, 0);
+
+    h_ref_cbk0(0) = Kokkos::complex<double>(infinity<double>::value, 0);
+    h_ref_cbk0(1) =
+        Kokkos::complex<double>(-2.078722558742977e-02, -2.431266356716766e-02);
+    h_ref_cbk0(2) =
+        Kokkos::complex<double>(-2.078722558742977e-02, +2.431266356716766e-02);
+    h_ref_cbk0(3) =
+        Kokkos::complex<double>(-1.357295320191579e+01, +1.499344424826928e+00);
+    h_ref_cbk0(4) =
+        Kokkos::complex<double>(-1.357295320191579e+01, -1.499344424826928e+00);
+    h_ref_cbk0(5) =
+        Kokkos::complex<double>(-1.820476218131465e-11, +1.795056004780177e-11);
+    h_ref_cbk0(6) =
+        Kokkos::complex<double>(-1.820476218131465e-11, -1.795056004780177e-11);
+    h_ref_cbk0(7) =
+        Kokkos::complex<double>(8.816423633943287e+08, +2.285988078870750e+09);
+    h_ref_cbk0(8) =
+        Kokkos::complex<double>(8.816423633943287e+08, -2.285988078870750e+09);
+    h_ref_cbk0(9) = Kokkos::complex<double>(3.473950438627926e-02, 0);
+    h_ref_cbk0(10) =
+        Kokkos::complex<double>(3.473950438627926e-02, -1.533346213144909e+01);
+    h_ref_cbk0(11) = Kokkos::complex<double>(2.667545110351910e-11, 0);
+    h_ref_cbk0(12) =
+        Kokkos::complex<double>(2.667545110351910e-11, -2.560844503718094e+09);
+    h_ref_cbk0(13) =
+        Kokkos::complex<double>(-1.163319528590747e-13, +1.073711234918388e-13);
+    h_ref_cbk0(14) =
+        Kokkos::complex<double>(-1.163319528590747e-13, -1.073711234918388e-13);
+    h_ref_cbk0(15) =
+        Kokkos::complex<double>(1.306638772421339e+11, +3.071215726177843e+11);
+    h_ref_cbk0(16) =
+        Kokkos::complex<double>(1.306638772421339e+11, -3.071215726177843e+11);
+    h_ref_cbk0(17) =
+        Kokkos::complex<double>(-1.111584549467388e-27, +8.581979311477652e-28);
+    h_ref_cbk0(18) =
+        Kokkos::complex<double>(-1.111584549467388e-27, -8.581979311477652e-28);
+    h_ref_cbk0(19) =
+        Kokkos::complex<double>(8.691857147870108e+24, +1.620552106793022e+25);
+    h_ref_cbk0(20) =
+        Kokkos::complex<double>(8.691857147870108e+24, -1.620552106793022e+25);
+    h_ref_cbk0(21) = Kokkos::complex<double>(1.630534586888181e-13, 0);
+    h_ref_cbk0(22) =
+        Kokkos::complex<double>(1.630534586888181e-13, -3.441131095391506e+11);
+    h_ref_cbk0(23) = Kokkos::complex<double>(1.413897840559108e-27, 0);
+    h_ref_cbk0(24) =
+        Kokkos::complex<double>(1.413897840559108e-27, -1.851678917759592e+25);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbi0(i) - h_ref_cbi0(i)),
+                Kokkos::abs(h_ref_cbi0(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)),
+                Kokkos::abs(h_ref_cbk0(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbi0_large     = ViewType("d_cbi0_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbi0_large     = Kokkos::create_mirror_view(d_cbi0_large);
+    h_ref_cbi0_large = HostViewType("h_ref_cbi0_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(1) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(2) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(3) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(4) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(5) = Kokkos::complex<double>(-100.0, 10.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi0_large, d_cbi0_large);
+
+    h_ref_cbi0_large(0) =
+        Kokkos::complex<double>(-9.266819049505678e+41, -5.370779383266049e+41);
+    h_ref_cbi0_large(1) =
+        Kokkos::complex<double>(-9.266819049505678e+41, +5.370779383266049e+41);
+
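+    // Entries 0 and 2 (defaults and an alternative parameter choice) must
+    // agree with the reference to better than 1e-15, while entry 1 (an
+    // intentionally insufficient parameter choice) must be visibly off;
+    // entries 3-5 repeat the pattern for the mirrored argument.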
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(0) - h_ref_cbi0_large(0)) <
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(1) - h_ref_cbi0_large(0)) >
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(2) - h_ref_cbi0_large(0)) <
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(3) - h_ref_cbi0_large(1)) <
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(4) - h_ref_cbi0_large(1)) >
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(5) - h_ref_cbi0_large(1)) <
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-15);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbi0(i) = Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cbk0(i) = Kokkos::Experimental::cyl_bessel_k0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
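+    // Evaluate each argument with the default algorithm parameters and with
+    // two explicit overrides (the trailing arguments); testit() checks
+    // which variants remain accurate.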
+    d_cbi0_large(0) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbi0_large(1) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 110, 35);
+    d_cbi0_large(2) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 110, 190);
+    d_cbi0_large(3) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbi0_large(4) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 110, 35);
+    d_cbi0_large(5) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 110, 190);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselI1K1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbi1, d_cbk1;
+  typename ViewType::HostMirror h_z, h_cbi1, h_cbk1;
+  HostViewType h_ref_cbi1, h_ref_cbk1;
+
+  ViewType d_z_large, d_cbi1_large;
+  typename ViewType::HostMirror h_z_large, h_cbi1_large;
+  HostViewType h_ref_cbi1_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbi1     = ViewType("d_cbi1", N);
+    d_cbk1     = ViewType("d_cbk1", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbi1     = Kokkos::create_mirror_view(d_cbi1);
+    h_cbk1     = Kokkos::create_mirror_view(d_cbk1);
+    h_ref_cbi1 = HostViewType("h_ref_cbi1", N);
+    h_ref_cbk1 = HostViewType("h_ref_cbk1", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi1, d_cbi1);
+    Kokkos::deep_copy(h_cbk1, d_cbk1);
+
+    // Reference values computed with Octave
+    h_ref_cbi1(0) = Kokkos::complex<double>(0, 0);
+    h_ref_cbi1(1) =
+        Kokkos::complex<double>(-8.127809410735776e-01, +3.780682961371298e+00);
+    h_ref_cbi1(2) =
+        Kokkos::complex<double>(-8.127809410735776e-01, -3.780682961371298e+00);
+    h_ref_cbi1(3) =
+        Kokkos::complex<double>(8.127809410735776e-01, +3.780682961371298e+00);
+    h_ref_cbi1(4) =
+        Kokkos::complex<double>(8.127809410735776e-01, -3.780682961371298e+00);
+    h_ref_cbi1(5) =
+        Kokkos::complex<double>(-7.119745937677552e+08, -2.813616375214342e+08);
+    h_ref_cbi1(6) =
+        Kokkos::complex<double>(-7.119745937677552e+08, +2.813616375214342e+08);
+    h_ref_cbi1(7) =
+        Kokkos::complex<double>(7.119745937677552e+08, -2.813616375214342e+08);
+    h_ref_cbi1(8) =
+        Kokkos::complex<double>(7.119745937677552e+08, +2.813616375214342e+08);
+    h_ref_cbi1(9)  = Kokkos::complex<double>(3.953370217402609e+00, 0);
+    h_ref_cbi1(10) = Kokkos::complex<double>(-3.953370217402609e+00, 0);
+    h_ref_cbi1(11) = Kokkos::complex<double>(7.972200260896506e+08, 0);
+    h_ref_cbi1(12) = Kokkos::complex<double>(-7.972200260896506e+08, 0);
+    h_ref_cbi1(13) =
+        Kokkos::complex<double>(-9.596150723281404e+10, -4.149038020045121e+10);
+    h_ref_cbi1(14) =
+        Kokkos::complex<double>(-9.596150723281404e+10, +4.149038020045121e+10);
+    h_ref_cbi1(15) =
+        Kokkos::complex<double>(9.596150723281404e+10, -4.149038020045121e+10);
+    h_ref_cbi1(16) =
+        Kokkos::complex<double>(9.596150723281404e+10, +4.149038020045121e+10);
+    h_ref_cbi1(17) =
+        Kokkos::complex<double>(-5.112615594220387e+24, -2.751210232069100e+24);
+    h_ref_cbi1(18) =
+        Kokkos::complex<double>(-5.112615594220387e+24, +2.751210232069100e+24);
+    h_ref_cbi1(19) =
+        Kokkos::complex<double>(5.112615594220387e+24, -2.751210232069100e+24);
+    h_ref_cbi1(20) =
+        Kokkos::complex<double>(5.112615594220387e+24, +2.751210232069100e+24);
+    h_ref_cbi1(21) = Kokkos::complex<double>(1.075605042080823e+11, 0);
+    h_ref_cbi1(22) = Kokkos::complex<double>(-1.075605042080823e+11, 0);
+    h_ref_cbi1(23) = Kokkos::complex<double>(5.844751588390470e+24, 0);
+    h_ref_cbi1(24) = Kokkos::complex<double>(-5.844751588390470e+24, 0);
+
+    h_ref_cbk1(0) = Kokkos::complex<double>(infinity<double>::value, 0);
+    h_ref_cbk1(1) =
+        Kokkos::complex<double>(-2.480952007015153e-02, -2.557074905635180e-02);
+    h_ref_cbk1(2) =
+        Kokkos::complex<double>(-2.480952007015153e-02, +2.557074905635180e-02);
+    h_ref_cbk1(3) =
+        Kokkos::complex<double>(-1.185255629692602e+01, +2.527855884398198e+00);
+    h_ref_cbk1(4) =
+        Kokkos::complex<double>(-1.185255629692602e+01, -2.527855884398198e+00);
+    h_ref_cbk1(5) =
+        Kokkos::complex<double>(-1.839497240093994e-11, +1.841855854336314e-11);
+    h_ref_cbk1(6) =
+        Kokkos::complex<double>(-1.839497240093994e-11, -1.841855854336314e-11);
+    h_ref_cbk1(7) =
+        Kokkos::complex<double>(8.839236534393319e+08, +2.236734153323357e+09);
+    h_ref_cbk1(8) =
+        Kokkos::complex<double>(8.839236534393319e+08, -2.236734153323357e+09);
+    h_ref_cbk1(9) = Kokkos::complex<double>(4.015643112819419e-02, 0);
+    h_ref_cbk1(10) =
+        Kokkos::complex<double>(-4.015643112819419e-02, -1.241987883191272e+01);
+    h_ref_cbk1(11) = Kokkos::complex<double>(2.724930589574976e-11, 0);
+    h_ref_cbk1(12) =
+        Kokkos::complex<double>(-2.724930589574976e-11, -2.504540577257910e+09);
+    h_ref_cbk1(13) =
+        Kokkos::complex<double>(-1.175637676331817e-13, +1.097080943197297e-13);
+    h_ref_cbk1(14) =
+        Kokkos::complex<double>(-1.175637676331817e-13, -1.097080943197297e-13);
+    h_ref_cbk1(15) =
+        Kokkos::complex<double>(1.303458736323849e+11, +3.014719661500124e+11);
+    h_ref_cbk1(16) =
+        Kokkos::complex<double>(1.303458736323849e+11, -3.014719661500124e+11);
+    h_ref_cbk1(17) =
+        Kokkos::complex<double>(-1.119411861396158e-27, +8.666195226392352e-28);
+    h_ref_cbk1(18) =
+        Kokkos::complex<double>(-1.119411861396158e-27, -8.666195226392352e-28);
+    h_ref_cbk1(19) =
+        Kokkos::complex<double>(8.643181853549355e+24, +1.606175559143138e+25);
+    h_ref_cbk1(20) =
+        Kokkos::complex<double>(8.643181853549355e+24, -1.606175559143138e+25);
+    h_ref_cbk1(21) = Kokkos::complex<double>(1.659400107332009e-13, 0);
+    h_ref_cbk1(22) =
+        Kokkos::complex<double>(-1.659400107332009e-13, -3.379112898365253e+11);
+    h_ref_cbk1(23) = Kokkos::complex<double>(1.425632026517104e-27, 0);
+    h_ref_cbk1(24) =
+        Kokkos::complex<double>(-1.425632026517104e-27, -1.836182865214478e+25);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbi1(i) - h_ref_cbi1(i)),
+                Kokkos::abs(h_ref_cbi1(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)),
+                Kokkos::abs(h_ref_cbk1(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbi1_large     = ViewType("d_cbi1_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbi1_large     = Kokkos::create_mirror_view(d_cbi1_large);
+    h_ref_cbi1_large = HostViewType("h_ref_cbi1_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(1) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(2) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(3) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(4) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(5) = Kokkos::complex<double>(-100.0, 10.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi1_large, d_cbi1_large);
+
+    h_ref_cbi1_large(0) =
+        Kokkos::complex<double>(-9.218158020154234e+41, -5.348736158968607e+41);
+    h_ref_cbi1_large(1) =
+        Kokkos::complex<double>(9.218158020154234e+41, -5.348736158968607e+41);
+
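+    // Entries 0 and 2 (defaults and an alternative parameter choice) must
+    // agree with the reference to better than 1e-15, while entry 1 (an
+    // intentionally insufficient parameter choice) must be visibly off;
+    // entries 3-5 repeat the pattern for the mirrored argument.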
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(0) - h_ref_cbi1_large(0)) <
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(1) - h_ref_cbi1_large(0)) >
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(2) - h_ref_cbi1_large(0)) <
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(3) - h_ref_cbi1_large(1)) <
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(4) - h_ref_cbi1_large(1)) >
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(5) - h_ref_cbi1_large(1)) <
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-15);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbi1(i) = Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cbk1(i) = Kokkos::Experimental::cyl_bessel_k1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
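+    // Evaluate each argument with the default algorithm parameters and with
+    // two explicit overrides (the trailing arguments); testit() checks
+    // which variants remain accurate.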
+    d_cbi1_large(0) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbi1_large(1) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 110, 35);
+    d_cbi1_large(2) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 110, 190);
+    d_cbi1_large(3) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbi1_large(4) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 110, 35);
+    d_cbi1_large(5) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 110, 190);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselH1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_ch10, d_ch11;
+  typename ViewType::HostMirror h_z, h_ch10, h_ch11;
+  HostViewType h_ref_ch10, h_ref_ch11;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_ch10     = ViewType("d_ch10", N);
+    d_ch11     = ViewType("d_ch11", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_ch10     = Kokkos::create_mirror_view(d_ch10);
+    h_ch11     = Kokkos::create_mirror_view(d_ch11);
+    h_ref_ch10 = HostViewType("h_ref_ch10", N);
+    h_ref_ch11 = HostViewType("h_ref_ch11", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(200.0, 60.0);
+    h_z(18) = Kokkos::complex<double>(200.0, -60.0);
+    h_z(19) = Kokkos::complex<double>(-200.0, 60.0);
+    h_z(20) = Kokkos::complex<double>(-200.0, -60.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(200.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-200.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Hankel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_ch10, d_ch10);
+    Kokkos::deep_copy(h_ch11, d_ch11);
+
+    // Reference values computed with Octave
+    h_ref_ch10(0) = Kokkos::complex<double>(1.0, -infinity<double>::value);
+    h_ref_ch10(1) =
+        Kokkos::complex<double>(-1.779327030399459e-02, +5.281940449715537e-02);
+    h_ref_ch10(2) =
+        Kokkos::complex<double>(-2.480676488910849e+00, +1.948786988612626e+00);
+    h_ref_ch10(3) =
+        Kokkos::complex<double>(1.779327030399459e-02, +5.281940449715537e-02);
+    h_ref_ch10(4) =
+        Kokkos::complex<double>(-2.516263029518839e+00, -1.843148179618315e+00);
+    h_ref_ch10(5) =
+        Kokkos::complex<double>(-7.217716938222564e-06, -1.002796203581228e-07);
+    h_ref_ch10(6) =
+        Kokkos::complex<double>(-3.204879955218674e+03, -1.446133490498241e+03);
+    h_ref_ch10(7) =
+        Kokkos::complex<double>(7.217716938222564e-06, -1.002796203581228e-07);
+    h_ref_ch10(8) =
+        Kokkos::complex<double>(-3.204879969654108e+03, +1.446133490297682e+03);
+    h_ref_ch10(9) =
+        Kokkos::complex<double>(-2.600519549019334e-01, +3.768500100127903e-01);
+    h_ref_ch10(10) =
+        Kokkos::complex<double>(2.600519549019334e-01, +3.768500100127903e-01);
+    h_ref_ch10(11) =
+        Kokkos::complex<double>(-1.624127813134865e-01, -3.598179027370283e-02);
+    h_ref_ch10(12) =
+        Kokkos::complex<double>(1.624127813134865e-01, -3.598179027370283e-02);
+    h_ref_ch10(13) =
+        Kokkos::complex<double>(-2.184905481759440e-06, +6.263387166445335e-06);
+    h_ref_ch10(14) =
+        Kokkos::complex<double>(-2.025824374843011e+03, +2.512479278555672e+03);
+    h_ref_ch10(15) =
+        Kokkos::complex<double>(2.184905481759440e-06, +6.263387166445335e-06);
+    h_ref_ch10(16) =
+        Kokkos::complex<double>(-2.025824379212821e+03, -2.512479266028897e+03);
+    h_ref_ch10(17) =
+        Kokkos::complex<double>(-1.983689762743337e-28, -4.408449940359881e-28);
+    h_ref_ch10(18) =
+        Kokkos::complex<double>(-8.261945332108929e+23, -6.252486138159269e+24);
+    h_ref_ch10(19) =
+        Kokkos::complex<double>(1.983689762743337e-28, -4.408449940359881e-28);
+    h_ref_ch10(20) =
+        Kokkos::complex<double>(-8.261945332108929e+23, +6.252486138159269e+24);
+    h_ref_ch10(21) =
+        Kokkos::complex<double>(-7.315701054899959e-02, +1.318364704235323e-01);
+    h_ref_ch10(22) =
+        Kokkos::complex<double>(7.315701054899959e-02, +1.318364704235323e-01);
+    h_ref_ch10(23) =
+        Kokkos::complex<double>(-1.543743993056510e-02, -5.426577524981793e-02);
+    h_ref_ch10(24) =
+        Kokkos::complex<double>(1.543743993056510e-02, -5.426577524981793e-02);
+
+    h_ref_ch11(0) = Kokkos::complex<double>(0.0, -infinity<double>::value);
+    h_ref_ch11(1) =
+        Kokkos::complex<double>(5.506759533731469e-02, +2.486728122475093e-02);
+    h_ref_ch11(2) =
+        Kokkos::complex<double>(1.505230101821194e+00, +2.546831401702448e+00);
+    h_ref_ch11(3) =
+        Kokkos::complex<double>(5.506759533731469e-02, -2.486728122475093e-02);
+    h_ref_ch11(4) =
+        Kokkos::complex<double>(-1.615365292495823e+00, +2.497096839252946e+00);
+    h_ref_ch11(5) =
+        Kokkos::complex<double>(-2.319863729607219e-07, +7.274197719836158e-06);
+    h_ref_ch11(6) =
+        Kokkos::complex<double>(-1.493895250453947e+03, +3.153217017782819e+03);
+    h_ref_ch11(7) =
+        Kokkos::complex<double>(-2.319863729607210e-07, -7.274197719836158e-06);
+    h_ref_ch11(8) =
+        Kokkos::complex<double>(1.493895250917918e+03, +3.153217003234423e+03);
+    h_ref_ch11(9) =
+        Kokkos::complex<double>(3.390589585259364e-01, +3.246744247918000e-01);
+    h_ref_ch11(10) =
+        Kokkos::complex<double>(3.390589585259364e-01, -3.246744247918000e-01);
+    h_ref_ch11(11) =
+        Kokkos::complex<double>(-3.951932188370152e-02, +1.616692009926331e-01);
+    h_ref_ch11(12) =
+        Kokkos::complex<double>(-3.951932188370151e-02, -1.616692009926331e-01);
+    h_ref_ch11(13) =
+        Kokkos::complex<double>(6.265071091331731e-06, +2.296112637347948e-06);
+    h_ref_ch11(14) =
+        Kokkos::complex<double>(2.466294194249553e+03, +2.054604534104335e+03);
+    h_ref_ch11(15) =
+        Kokkos::complex<double>(6.265071091331731e-06, -2.296112637347947e-06);
+    h_ref_ch11(16) =
+        Kokkos::complex<double>(-2.466294206779695e+03, +2.054604529512110e+03);
+    h_ref_ch11(17) =
+        Kokkos::complex<double>(-4.416040381930448e-28, +1.974955285825768e-28);
+    h_ref_ch11(18) =
+        Kokkos::complex<double>(-6.250095237987940e+24, +8.112776606830997e+23);
+    h_ref_ch11(19) =
+        Kokkos::complex<double>(-4.416040381930448e-28, -1.974955285825769e-28);
+    h_ref_ch11(20) =
+        Kokkos::complex<double>(6.250095237987940e+24, +8.112776606831005e+23);
+    h_ref_ch11(21) =
+        Kokkos::complex<double>(1.305514883350938e-01, +7.552212658226459e-02);
+    h_ref_ch11(22) =
+        Kokkos::complex<double>(1.305514883350938e-01, -7.552212658226456e-02);
+    h_ref_ch11(23) =
+        Kokkos::complex<double>(-5.430453818237824e-02, +1.530182458038999e-02);
+    h_ref_ch11(24) =
+        Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458039000e-02);
+
+    EXPECT_EQ(h_ref_ch10(0), h_ch10(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)),
+                Kokkos::abs(h_ref_ch10(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_ch11(0), h_ch11(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)),
+                Kokkos::abs(h_ref_ch11(i)) * 1e-13);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_ch10(i) = Kokkos::Experimental::cyl_bessel_h10(d_z(i));
+    d_ch11(i) = Kokkos::Experimental::cyl_bessel_h11(d_z(i));
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselH2Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_ch20, d_ch21;
+  typename ViewType::HostMirror h_z, h_ch20, h_ch21;
+  HostViewType h_ref_ch20, h_ref_ch21;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_ch20     = ViewType("d_ch20", N);
+    d_ch21     = ViewType("d_ch21", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_ch20     = Kokkos::create_mirror_view(d_ch20);
+    h_ch21     = Kokkos::create_mirror_view(d_ch21);
+    h_ref_ch20 = HostViewType("h_ref_ch20", N);
+    h_ref_ch21 = HostViewType("h_ref_ch21", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(200.0, 60.0);
+    h_z(18) = Kokkos::complex<double>(200.0, -60.0);
+    h_z(19) = Kokkos::complex<double>(-200.0, 60.0);
+    h_z(20) = Kokkos::complex<double>(-200.0, -60.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(200.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-200.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Hankel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_ch20, d_ch20);
+    Kokkos::deep_copy(h_ch21, d_ch21);
+
+    // Reference values computed with Octave
+    h_ref_ch20(0) = Kokkos::complex<double>(1.0, infinity<double>::value);
+    h_ref_ch20(1) =
+        Kokkos::complex<double>(-2.480676488910849e+00, -1.948786988612626e+00);
+    h_ref_ch20(2) =
+        Kokkos::complex<double>(-1.779327030399459e-02, -5.281940449715537e-02);
+    h_ref_ch20(3) =
+        Kokkos::complex<double>(-2.516263029518839e+00, +1.843148179618315e+00);
+    h_ref_ch20(4) =
+        Kokkos::complex<double>(1.779327030399459e-02, -5.281940449715537e-02);
+    h_ref_ch20(5) =
+        Kokkos::complex<double>(-3.204879955218674e+03, +1.446133490498241e+03);
+    h_ref_ch20(6) =
+        Kokkos::complex<double>(-7.217716938222564e-06, +1.002796203581228e-07);
+    h_ref_ch20(7) =
+        Kokkos::complex<double>(-3.204879969654108e+03, -1.446133490297682e+03);
+    h_ref_ch20(8) =
+        Kokkos::complex<double>(7.217716938222564e-06, +1.002796203581228e-07);
+    h_ref_ch20(9) =
+        Kokkos::complex<double>(-2.600519549019334e-01, -3.768500100127903e-01);
+    h_ref_ch20(10) =
+        Kokkos::complex<double>(-7.801558647058006e-01, -3.768500100127903e-01);
+    h_ref_ch20(11) =
+        Kokkos::complex<double>(-1.624127813134865e-01, +3.598179027370283e-02);
+    h_ref_ch20(12) =
+        Kokkos::complex<double>(-4.872383439404597e-01, +3.598179027370281e-02);
+    h_ref_ch20(13) =
+        Kokkos::complex<double>(-2.025824374843011e+03, -2.512479278555672e+03);
+    h_ref_ch20(14) =
+        Kokkos::complex<double>(-2.184905481759440e-06, -6.263387166445335e-06);
+    h_ref_ch20(15) =
+        Kokkos::complex<double>(-2.025824379212821e+03, +2.512479266028897e+03);
+    h_ref_ch20(16) =
+        Kokkos::complex<double>(2.184905481759440e-06, -6.263387166445335e-06);
+    h_ref_ch20(17) =
+        Kokkos::complex<double>(-8.261945332108929e+23, +6.252486138159269e+24);
+    h_ref_ch20(18) =
+        Kokkos::complex<double>(-1.983689762743337e-28, +4.408449940359881e-28);
+    h_ref_ch20(19) =
+        Kokkos::complex<double>(-8.261945332108929e+23, -6.252486138159269e+24);
+    h_ref_ch20(20) =
+        Kokkos::complex<double>(1.983689762743337e-28, +4.408449940359881e-28);
+    h_ref_ch20(21) =
+        Kokkos::complex<double>(-7.315701054899959e-02, -1.318364704235323e-01);
+    h_ref_ch20(22) =
+        Kokkos::complex<double>(-2.194710316469988e-01, -1.318364704235323e-01);
+    h_ref_ch20(23) =
+        Kokkos::complex<double>(-1.543743993056510e-02, +5.426577524981793e-02);
+    h_ref_ch20(24) =
+        Kokkos::complex<double>(-4.631231979169528e-02, +5.426577524981793e-02);
+
+    h_ref_ch21(0) = Kokkos::complex<double>(0.0, infinity<double>::value);
+    h_ref_ch21(1) =
+        Kokkos::complex<double>(1.505230101821194e+00, -2.546831401702448e+00);
+    h_ref_ch21(2) =
+        Kokkos::complex<double>(5.506759533731469e-02, -2.486728122475093e-02);
+    h_ref_ch21(3) =
+        Kokkos::complex<double>(-1.615365292495823e+00, -2.497096839252946e+00);
+    h_ref_ch21(4) =
+        Kokkos::complex<double>(5.506759533731469e-02, +2.486728122475093e-02);
+    h_ref_ch21(5) =
+        Kokkos::complex<double>(-1.493895250453947e+03, -3.153217017782819e+03);
+    h_ref_ch21(6) =
+        Kokkos::complex<double>(-2.319863729607219e-07, -7.274197719836158e-06);
+    h_ref_ch21(7) =
+        Kokkos::complex<double>(1.493895250917918e+03, -3.153217003234423e+03);
+    h_ref_ch21(8) =
+        Kokkos::complex<double>(-2.319863729607210e-07, +7.274197719836158e-06);
+    h_ref_ch21(9) =
+        Kokkos::complex<double>(3.390589585259364e-01, -3.246744247918000e-01);
+    h_ref_ch21(10) =
+        Kokkos::complex<double>(-1.017176875577809e+00, +3.246744247918000e-01);
+    h_ref_ch21(11) =
+        Kokkos::complex<double>(-3.951932188370152e-02, -1.616692009926331e-01);
+    h_ref_ch21(12) =
+        Kokkos::complex<double>(1.185579656511045e-01, +1.616692009926332e-01);
+    h_ref_ch21(13) =
+        Kokkos::complex<double>(2.466294194249553e+03, -2.054604534104335e+03);
+    h_ref_ch21(14) =
+        Kokkos::complex<double>(6.265071091331731e-06, -2.296112637347948e-06);
+    h_ref_ch21(15) =
+        Kokkos::complex<double>(-2.466294206779695e+03, -2.054604529512110e+03);
+    h_ref_ch21(16) =
+        Kokkos::complex<double>(6.265071091331731e-06, +2.296112637347947e-06);
+    h_ref_ch21(17) =
+        Kokkos::complex<double>(-6.250095237987940e+24, -8.112776606830997e+23);
+    h_ref_ch21(18) =
+        Kokkos::complex<double>(-4.416040381930448e-28, -1.974955285825768e-28);
+    h_ref_ch21(19) =
+        Kokkos::complex<double>(6.250095237987940e+24, -8.112776606831005e+23);
+    h_ref_ch21(20) =
+        Kokkos::complex<double>(-4.416040381930448e-28, +1.974955285825769e-28);
+    h_ref_ch21(21) =
+        Kokkos::complex<double>(1.305514883350938e-01, -7.552212658226459e-02);
+    h_ref_ch21(22) =
+        Kokkos::complex<double>(-3.916544650052814e-01, +7.552212658226461e-02);
+    h_ref_ch21(23) =
+        Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458038999e-02);
+    h_ref_ch21(24) =
+        Kokkos::complex<double>(1.629136145471347e-01, +1.530182458039000e-02);
+
+    EXPECT_EQ(h_ref_ch20(0), h_ch20(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)),
+                Kokkos::abs(h_ref_ch20(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_ch21(0), h_ch21(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)),
+                Kokkos::abs(h_ref_ch21(i)) * 1e-13);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_ch20(i) = Kokkos::Experimental::cyl_bessel_h20(d_z(i));
+    d_ch21(i) = Kokkos::Experimental::cyl_bessel_h21(d_z(i));
+  }
+};
+
+TEST(TEST_CATEGORY, mathspecialfunc_expint1) {
+  TestExponentialIntergral1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) {
+  TestComplexErrorFunction<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) {
+  TestComplexBesselJ0Y0Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) {
+  TestComplexBesselJ1Y1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) {
+  TestComplexBesselI0K0Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) {
+  TestComplexBesselI1K1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) {
+  TestComplexBesselH1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) {
+  TestComplexBesselH2Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+}  // namespace Test
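
The comparison loops above follow a simple pattern for validating device-computed
special-function values against tabulated references: exact equality for the
special value at index 0 (the pole at the origin, an infinite imaginary part) and
a relative-error bound everywhere else. A minimal sketch of that pattern (helper
name and tolerance are illustrative, not part of the patch):

    #include <Kokkos_Core.hpp>
    #include <gtest/gtest.h>

    template <class HostView>
    void expect_relatively_close(const HostView& computed,
                                 const HostView& reference,
                                 double rel_tol = 1e-13) {
      // index 0 holds a special value and is compared exactly
      EXPECT_EQ(reference(0), computed(0));
      for (int i = 1; i < static_cast<int>(reference.extent(0)); ++i) {
        // pass when |computed - reference| <= |reference| * rel_tol
        EXPECT_LE(Kokkos::abs(computed(i) - reference(i)),
                  Kokkos::abs(reference(i)) * rel_tol);
      }
    }
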
diff --git a/packages/kokkos/core/unit_test/TestMemoryPool.hpp b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
index 63895ad47dc435c98201a2b46d8b439d2a50ad51..829e8d641a5b00a0be67200bdf30495951e95457 100644
--- a/packages/kokkos/core/unit_test/TestMemoryPool.hpp
+++ b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -50,7 +50,7 @@
 #include <cmath>
 #include <algorithm>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace TestMemoryPool {
 
diff --git a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
index 6c8a47a5861dd361364a94551abcfd50d0e85153..d7607c4f71b0df4f32af77f4441c8c909992b14a 100644
--- a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
+++ b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
@@ -48,7 +48,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -310,6 +310,46 @@ struct array_reduce {
     return lsum;
   }
 };
+
+struct point_t {
+  uint8_t x, y, z;
+
+  KOKKOS_FUNCTION
+  point_t() : x(1), y(1), z(1) {}
+
+  KOKKOS_FUNCTION
+  point_t(const point_t &val) : x(val.x), y(val.y), z(val.z) {}
+
+  KOKKOS_FUNCTION
+  point_t(const volatile point_t &val) : x(val.x), y(val.y), z(val.z) {}
+
+  KOKKOS_FUNCTION
+  point_t(const int rhs) { x = y = z = static_cast<uint8_t>(rhs); }
+
+  KOKKOS_FUNCTION
+  explicit operator int() const { return static_cast<int>(x + y + z); }
+
+  KOKKOS_FUNCTION
+  bool operator==(const volatile point_t rhs) const volatile {
+    return (x == rhs.x && y == rhs.y && z == rhs.z);
+  }
+
+  KOKKOS_FUNCTION
+  void operator=(point_t rhs) volatile {
+    x = rhs.x;
+    y = rhs.y;
+    z = rhs.z;
+  }
+
+  KOKKOS_FUNCTION
+  volatile point_t operator+=(const volatile point_t rhs) volatile {
+    x += rhs.x;
+    y += rhs.y;
+    z += rhs.z;
+    return *this;
+  }
+};
+
 }  // namespace Test
 
 namespace Kokkos {
@@ -334,5 +374,21 @@ struct reduction_identity<Test::array_reduce<scalar_t, N>> {
     return Test::array_reduce<scalar_t, N>(t_red_ident::prod());
   }
 };
+
+template <>
+struct reduction_identity<Test::point_t> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t sum() noexcept {
+    return 0;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t prod() noexcept {
+    return 1;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t max() noexcept {
+    return 0xff;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t min() noexcept {
+    return 0x0;
+  }
+};
 }  // namespace Kokkos
 #endif  // TESTNONTRIVIALSCALARTYPES_HPP_
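
With point_t's reduction_identity specialization above in place, point_t can be
fed to a built-in reducer even though it is not an arithmetic type; the identity
members return uint8_t values that convert through the point_t(int) constructor.
A minimal usage sketch (function name and extent are illustrative):

    #include <Kokkos_Core.hpp>

    void sum_points(int n) {
      Test::point_t result;
      Kokkos::parallel_reduce(
          "sum_point_t", n,
          KOKKOS_LAMBDA(int, Test::point_t& update) {
            update += Test::point_t(1);  // each iteration contributes (1,1,1)
          },
          Kokkos::Sum<Test::point_t>(result));
      // result now holds (n, n, n), modulo uint8_t wrap-around for n >= 256
    }
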
diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
index fe01b83834f26eddc15e71360d77e85452ef0238..cb69cb83211e7b82f941e544b0498da6df737cf4 100644
--- a/packages/kokkos/core/unit_test/TestNumericTraits.hpp
+++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
@@ -46,6 +46,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <type_traits>
+#include <limits>
 #include "Kokkos_NumericTraits.hpp"
 #include "Kokkos_ExecPolicy.hpp"
 
@@ -198,7 +199,9 @@ struct TestNumericTraits<
 TEST(TEST_CATEGORY, numeric_traits_infinity) {
   TestNumericTraits<TEST_EXECSPACE, float, Infinity>();
   TestNumericTraits<TEST_EXECSPACE, double, Infinity>();
+#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1 see issue #4100
   TestNumericTraits<TEST_EXECSPACE, long double, Infinity>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_epsilon) {
@@ -334,3 +337,182 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) {
   TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>();
   TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>();
 }
+
+namespace NumericTraitsSFINAE {
+
+struct HasNoSpecialization {};
+
+#define CHECK_TRAIT_IS_SFINAE_FRIENDLY(TRAIT)                              \
+  template <class T>                                                       \
+  using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT<T>::value); \
+  template <class T>                                                       \
+  using has_##TRAIT = Kokkos::is_detected<TRAIT##_value_t, T>;             \
+  static_assert(!has_##TRAIT<HasNoSpecialization>::value, "");
+
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_max)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(epsilon)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(round_error)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(norm_min)
+
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(digits)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(digits10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_digits10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(radix)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(min_exponent)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(min_exponent10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_exponent)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_exponent10)
+
+}  // namespace NumericTraitsSFINAE
+
+// Example detecting presence or absence of values
+template <class T>
+using infinity_value_t = decltype(Kokkos::Experimental::infinity<T>::value);
+
+template <class T>
+using has_infinity = Kokkos::is_detected<infinity_value_t, T>;
+
+template <class T, std::enable_if_t<has_infinity<T>::value>* = nullptr>
+constexpr T legacy_std_numeric_limits_infinity() {
+  return Kokkos::Experimental::infinity<T>::value;
+}
+
+template <class T, std::enable_if_t<!has_infinity<T>::value>* = nullptr>
+constexpr T legacy_std_numeric_limits_infinity() {
+  return T();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_sfinae_friendly) {
+  ASSERT_EQ(legacy_std_numeric_limits_infinity<int>(), 0);
+}
+
+// Compare to std::numeric_limits
+template <int V1, int V2>
+struct AssertIntEquality {
+  static constexpr bool value = false;
+};
+template <int V>
+struct AssertIntEquality<V, V> {
+  static constexpr bool value = true;
+};
+#define CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(T, TRAIT)           \
+  static_assert(AssertIntEquality<Kokkos::Experimental::TRAIT<T>::value, \
+                                  std::numeric_limits<T>::TRAIT>::value, \
+                "")
+#define CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \
+  static_assert(Kokkos::Experimental::TRAIT<T>::value ==       \
+                    std::numeric_limits<T>::TRAIT(),           \
+                "")
+
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, epsilon);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, epsilon);
+#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, epsilon);
+#endif
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, round_error);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, round_error);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, round_error);
+// clang-format off
+static_assert(Kokkos::Experimental::norm_min<float      >::value == std::numeric_limits<      float>::min(), "");
+static_assert(Kokkos::Experimental::norm_min<double     >::value == std::numeric_limits<     double>::min(), "");
+static_assert(Kokkos::Experimental::norm_min<long double>::value == std::numeric_limits<long double>::min(), "");
+// integer types
+static_assert(Kokkos::Experimental::finite_min<char                  >::value == std::numeric_limits<                  char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<signed char           >::value == std::numeric_limits<           signed char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned char         >::value == std::numeric_limits<         unsigned char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<short                 >::value == std::numeric_limits<                 short>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned short        >::value == std::numeric_limits<        unsigned short>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<int                   >::value == std::numeric_limits<                   int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned int          >::value == std::numeric_limits<          unsigned int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<long int              >::value == std::numeric_limits<              long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned long int     >::value == std::numeric_limits<     unsigned long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<long long int         >::value == std::numeric_limits<         long long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_max<char                  >::value == std::numeric_limits<                  char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<signed char           >::value == std::numeric_limits<           signed char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned char         >::value == std::numeric_limits<         unsigned char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<short                 >::value == std::numeric_limits<                 short>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned short        >::value == std::numeric_limits<        unsigned short>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<int                   >::value == std::numeric_limits<                   int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned int          >::value == std::numeric_limits<          unsigned int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long int              >::value == std::numeric_limits<              long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned long int     >::value == std::numeric_limits<     unsigned long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long long int         >::value == std::numeric_limits<         long long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::max(), "");
+// floating point types
+static_assert(Kokkos::Experimental::finite_min<float      >::value == -std::numeric_limits<      float>::max(), "");
+static_assert(Kokkos::Experimental::finite_min<double     >::value == -std::numeric_limits<     double>::max(), "");
+static_assert(Kokkos::Experimental::finite_min<long double>::value == -std::numeric_limits<long double>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<float      >::value ==  std::numeric_limits<      float>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<double     >::value ==  std::numeric_limits<     double>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long double>::value ==  std::numeric_limits<long double>::max(), "");
+// clang-format on
+
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10);
+
+#undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION
+#undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT
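
A worked reminder of what two of the compared constants encode, under the common
assumption of a 32-bit int: digits counts the value bits of the radix-2
representation, and digits10 is floor(digits * log10(2)), the number of decimal
digits guaranteed to survive a round trip.

    #include <limits>

    // assumes a platform with 32-bit int
    static_assert(std::numeric_limits<int>::digits == 31, "31 value bits");
    static_assert(std::numeric_limits<int>::digits10 == 9,
                  "floor(31 * log10(2)) = floor(9.33) = 9");
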
diff --git a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 0017c690e75c6e1bde1808e87203d8dbbea754cc..d75d78b31f08f3d6234a174053630abace0d781a 100644
--- a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -291,34 +291,34 @@ class TestRangePolicyConstruction {
     using policy_t = Kokkos::RangePolicy<>;
     {
       policy_t p(5, 15);
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
     }
     {
       policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15);
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
     }
     {
       policy_t p(5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
     {
       policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
     {
       policy_t p;
-      ASSERT_TRUE((p.begin() == 0));
-      ASSERT_TRUE((p.end() == 0));
+      ASSERT_EQ(p.begin(), 0);
+      ASSERT_EQ(p.end(), 0);
       p = policy_t(5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
   }
 };
@@ -582,7 +582,7 @@ class TestTeamPolicyConstruction {
     ASSERT_EQ(p1.team_size(), team_size);
 // FIXME_SYCL implement chunk_size
 #ifndef KOKKOS_ENABLE_SYCL
-    ASSERT_TRUE(p1.chunk_size() > 0);
+    ASSERT_GT(p1.chunk_size(), 0);
 #endif
     ASSERT_EQ(p1.scratch_size(0), 0);
 
@@ -795,7 +795,7 @@ TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) {
   static_assert(sizeof(decltype(policy)) == 1, "");
   static_assert_dummy_policy_must_be_size_one<sizeof(decltype(policy))>
       _assert1{};
-  (void)_assert1;  // avoid unused variable warning
+  (void)&_assert1;  // avoid unused variable warning
 
   using Kokkos::Experimental::DesiredOccupancy;
   auto policy_with_occ =
@@ -805,7 +805,7 @@ TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) {
   static_assert_dummy_policy_must_be_size_of_desired_occupancy<
       sizeof(decltype(policy_with_occ)), sizeof(DesiredOccupancy)>
       _assert2{};
-  (void)_assert2;  // avoid unused variable warning
+  (void)&_assert2;  // avoid unused variable warning
 }
 
 template <typename Policy>
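
The mechanical ASSERT_TRUE-to-ASSERT_EQ conversions above are worth the churn
because the two macros fail differently: ASSERT_EQ prints both operand values,
while ASSERT_TRUE only reports that the expression was false. Schematically:

    Kokkos::RangePolicy<> p(5, 15, Kokkos::ChunkSize(10));
    ASSERT_EQ(p.begin(), 5);      // failure message shows p.begin() and 5
    ASSERT_TRUE(p.begin() == 5);  // failure message shows only "false"
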
diff --git a/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e45d84e7e05b5beaed658fff20201968fd0d1050
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_LIBQUADMATH
+
+#include <impl/Kokkos_QuadPrecisionMath.hpp>
+#include <Kokkos_Core.hpp>
+
+#include <gtest/gtest.h>
+
+// FIXME instantiate only once for default host execution space
+TEST(TEST_CATEGORY, quad_precision_reductions) {
+  int const n = 100;
+  __float128 r;
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v += static_cast<__float128>(i); },
+      r);
+  EXPECT_EQ(r, n * (n - 1) / 2);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v += static_cast<__float128>(i); },
+      Kokkos::Sum<__float128>(r));
+  EXPECT_EQ(r, n * (n - 1) / 2);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) {
+        if (v > static_cast<__float128>(i)) {
+          v = static_cast<__float128>(i);
+        }
+      },
+      Kokkos::Min<__float128>(r));
+  EXPECT_EQ(r, 0);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) {
+        if (v < static_cast<__float128>(i)) {
+          v = static_cast<__float128>(i);
+        }
+      },
+      Kokkos::Max<__float128>(r));
+  EXPECT_EQ(r, n - 1);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(1, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v *= static_cast<__float128>(i); },
+      Kokkos::Prod<__float128>(r));
+  // product over i = 1..n-1 is (n-1)! = tgamma(n)
+  EXPECT_FLOAT_EQ(r, tgammaq(n));
+}
+
+TEST(TEST_CATEGORY, quad_precision_common_math_functions) {
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, 1),
+      KOKKOS_LAMBDA(int) {
+        (void)Kokkos::Experimental::fabs((__float128)0);
+        (void)Kokkos::Experimental::sqrt((__float128)1);
+        (void)Kokkos::Experimental::exp((__float128)2);
+        (void)Kokkos::Experimental::sin((__float128)3);
+        (void)Kokkos::Experimental::cosh((__float128)4);
+      });
+}
+
+#endif
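
A worked check of the expected values in the reductions above, for n = 100: the
sum over i = 0..99 is 100 * 99 / 2 = 4950, min and max over that range are 0 and
n - 1 = 99, and the product runs over RangePolicy(1, n), i.e. i = 1..99, giving
99! = tgamma(100), hence the comparison against tgammaq(n).

    static_assert(100 * (100 - 1) / 2 == 4950, "sum over i = 0..99");
    // 99! has far more than 113 significant bits, so even __float128 holds it
    // only approximately; a float-level comparison is all that is meaningful.
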
diff --git a/packages/kokkos/core/unit_test/TestRange.hpp b/packages/kokkos/core/unit_test/TestRange.hpp
index a6a6220f2dceea470414fb0d712796689f6d151c..d6b5d8fecc86173d3fa438cbca5b8242b48ddb36 100644
--- a/packages/kokkos/core/unit_test/TestRange.hpp
+++ b/packages/kokkos/core/unit_test/TestRange.hpp
@@ -317,10 +317,10 @@ struct TestRange {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
@@ -361,10 +361,10 @@ struct TestRange {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
diff --git a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
index 693f19613db6beb8c1c2a551574808de26633726..508b7192cb29aa87d3d28930d0babe0be1024432 100644
--- a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
+++ b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
@@ -309,10 +309,10 @@ struct TestRangeRequire {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
@@ -353,10 +353,10 @@ struct TestRangeRequire {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp
index 5f7fbd5623d6e8e4c25c261a0f092d79c1573fba..81e063f83e3ae4fba46f525756c262cb851d2068 100644
--- a/packages/kokkos/core/unit_test/TestReduce.hpp
+++ b/packages/kokkos/core/unit_test/TestReduce.hpp
@@ -539,6 +539,10 @@ class TestReduceDynamicView {
 
 }  // namespace
 
+// FIXME_OPENMPTARGET: The feature works with LLVM/13 on NVIDIA
+// architectures, but the Jenkins CI currently tests with LLVM/12.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET) ||                                 \
+    (defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300))
 TEST(TEST_CATEGORY, int64_t_reduce) {
   TestReduce<int64_t, TEST_EXECSPACE>(0);
   TestReduce<int64_t, TEST_EXECSPACE>(1000000);
@@ -563,7 +567,10 @@ TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) {
   TestReduceDynamicView<int64_t, TEST_EXECSPACE>(0);
   TestReduceDynamicView<int64_t, TEST_EXECSPACE>(1000000);
 }
+#endif
 
+// FIXME_OPENMPTARGET: Not yet implemented.
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY, int_combined_reduce) {
   using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;
   constexpr uint64_t nw = 1000;
@@ -626,4 +633,5 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) {
   ASSERT_EQ(nsum, result2);
   ASSERT_EQ(nsum, result3_v());
 }
+#endif
 }  // namespace Test
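
The int_combined_reduce tests kept above (for non-OpenMPTarget builds) exercise
the combined-reduction overloads of parallel_reduce, where one pass produces
several results at once. A minimal sketch with plain scalar results (extent and
integrands are illustrative):

    int64_t r0 = 0, r1 = 0, r2 = 0;
    Kokkos::parallel_reduce(
        "combined", Kokkos::RangePolicy<TEST_EXECSPACE>(0, 1000),
        KOKKOS_LAMBDA(const int i, int64_t& a, int64_t& b, int64_t& c) {
          a += i;
          b += 2 * i;
          c += 3 * i;
        },
        r0, r1, r2);
    // r0 = 499500, r1 = 2 * r0, r2 = 3 * r0
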
diff --git a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
index 68e7d746dd91a68046c4d074884ef5aef7519427..4664f265594b858e8879e7d2faa3aca62d320a0d 100644
--- a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
+++ b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
@@ -439,11 +439,11 @@ struct TestReduceCombinatoricalInstantiation {
                        Test::ReduceCombinatorical::AddPlus<double>(value));
     if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) &&
         (ExecSpace::concurrency() > 1) && (expected_result > 0)) {
-      ASSERT_TRUE(expected_result < value);
+      ASSERT_LT(expected_result, value);
     } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) ||
                 (ExecSpace::concurrency() > 1)) &&
                (expected_result > 0)) {
-      ASSERT_TRUE(expected_result <= value);
+      ASSERT_LE(expected_result, value);
     } else {
       ASSERT_EQ(expected_result, value);
     }
@@ -453,11 +453,11 @@ struct TestReduceCombinatoricalInstantiation {
     CallParallelReduce(args..., add);
     if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) &&
         (ExecSpace::concurrency() > 1) && (expected_result > 0)) {
-      ASSERT_TRUE(expected_result < value);
+      ASSERT_LT(expected_result, value);
     } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) ||
                 (ExecSpace::concurrency() > 1)) &&
                (expected_result > 0)) {
-      ASSERT_TRUE(expected_result <= value);
+      ASSERT_LE(expected_result, value);
     } else {
       ASSERT_EQ(expected_result, value);
     }
diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp
index 35f0e231fd2a7b1e88bbf4be568532aa5c219e3f..0d5f7fe7ba538524e0119c950f01469c7aa48a83 100644
--- a/packages/kokkos/core/unit_test/TestReducers.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers.hpp
@@ -296,7 +296,8 @@ struct TestReducers {
     Scalar reference_sum = 0;
 
     for (int i = 0; i < N; i++) {
-      h_values(i) = (Scalar)(rand() % 100);
+      int denom   = sizeof(Scalar) <= 2 ? 10 : 100;
+      h_values(i) = (Scalar)(rand() % denom);
       reference_sum += h_values(i);
     }
     Kokkos::deep_copy(values, h_values);
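
The smaller denominator matters because test_sum is also instantiated for 8-bit
integers and 16-bit floating-point scalars (see TestReducers_d.hpp below): with
values drawn from [0, 100) a handful of summands already overflows int8_t, and
half precision represents integers exactly only up to 2^11. Spot checks of those
limits:

    #include <cstdint>
    #include <limits>

    static_assert(std::numeric_limits<std::int8_t>::max() == 127,
                  "two summands of 99 already overflow an int8_t sum");
    static_assert((1 << 11) == 2048,
                  "half-precision integers are exact only up to 2048");
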
diff --git a/packages/kokkos/core/unit_test/TestReducers_d.hpp b/packages/kokkos/core/unit_test/TestReducers_d.hpp
index e2254a1c1fe653b22c3e6b9a9ebad50d07a9eb89..2d5802cdd4fcde24e8ac1dfe0f8d42ba9eaf396b 100644
--- a/packages/kokkos/core/unit_test/TestReducers_d.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers_d.hpp
@@ -64,4 +64,49 @@ TEST(TEST_CATEGORY, reducers_struct) {
   TestReducers<array_reduce<float, 7>, TEST_EXECSPACE>::test_sum(1031);
 #endif
 }
+
+TEST(TEST_CATEGORY, reducers_half_t) {
+  using ThisTestType = Kokkos::Experimental::half_t;
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(101);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(202);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(303);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(5);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(10);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(15);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(20);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(25);
+}
+
+TEST(TEST_CATEGORY, reducers_int8_t) {
+  using ThisTestType = int8_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4);
+}
+
+#if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET)
+// TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to
+//                 implicitly-deleted default constructor of 'conv_type'
+//                   conv_type tmp_in;"
+//
+// TODO - resolve:  4: [  FAILED  ] openmptarget.reducers_point_t (1 ms)
+TEST(TEST_CATEGORY, reducers_point_t) {
+  using ThisTestType = point_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);
+}
+#endif  // !KOKKOS_ENABLE_HIP && !KOKKOS_ENABLE_OPENMPTARGET
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReductions.hpp b/packages/kokkos/core/unit_test/TestReductions.hpp
index 949ca7eaf30a4746a8fec355f1b62c035c83d041..1fa8a2e92e68f7c3bf34e6cc4cc96b29b73071f3 100644
--- a/packages/kokkos/core/unit_test/TestReductions.hpp
+++ b/packages/kokkos/core/unit_test/TestReductions.hpp
@@ -45,8 +45,6 @@
 #ifndef KOKKOS_TEST_REDUCTIONS_HPP
 #define KOKKOS_TEST_REDUCTIONS_HPP
 #include <Kokkos_Macros.hpp>
-#ifndef KOKKOS_ENABLE_OPENMPTARGET
 #include <TestReduce.hpp>
-#endif
 #include <TestCXX11Deduction.hpp>
 #endif
diff --git a/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
index 17563de335e5b6a6170985e392ea8ae0de5ae8c1..6ffa11b11ca2d639bd9fd930a733d41ae7950482 100644
--- a/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
+++ b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
@@ -32,11 +32,17 @@ void test_reduce_device_view(int64_t N, PolicyType policy,
   typename ExecSpace::execution_space().fence();
   double time_fence0 = timer.seconds();
   Kokkos::deep_copy(result, 0);
+
+  // We need a warm-up run: the first launch pays one-time initialization
+  // costs that would otherwise skew the timings below
+  Kokkos::parallel_reduce("Test::ReduceDeviceView::TestReducer", policy,
+                          functor,
+                          Kokkos::Sum<int64_t, TEST_EXECSPACE>(result));
+  Kokkos::fence();
+
   timer.reset();
   bool is_async = time0 < time_fence0;
 
   // Test Reducer
-
   Kokkos::parallel_reduce("Test::ReduceDeviceView::TestReducer", policy,
                           functor,
                           Kokkos::Sum<int64_t, TEST_EXECSPACE>(result));
@@ -75,11 +81,11 @@ void test_reduce_device_view(int64_t N, PolicyType policy,
 
   ASSERT_EQ(N, scalar_result);
   if (is_async) {
-    ASSERT_TRUE(time1 < time_fence1);
+    ASSERT_LT(time1, time_fence1);
   }
   if (is_async) {
-    ASSERT_TRUE(time2 < time_fence2);
-    ASSERT_TRUE(time3 > time_fence3);
+    ASSERT_LT(time2, time_fence2);
+    ASSERT_GT(time3, time_fence3);
   }
 }
 
@@ -128,8 +134,6 @@ TEST(TEST_CATEGORY, reduce_device_view_mdrange_policy) {
       MDRangePolicyFunctor());
 }
 
-// FIXME_HIP
-#ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, reduce_device_view_team_policy) {
 // FIXME_SYCL The number of workgroups on CUDA devices can not be larger than
 // 65535
@@ -145,5 +149,4 @@ TEST(TEST_CATEGORY, reduce_device_view_team_policy) {
       TeamPolicyFunctor(1024));
 #endif
 }
-#endif
 }  // namespace Test
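
The warm-up inserted above feeds the timing heuristic this test relies on to
decide whether the backend dispatches asynchronously: a launch that returns
quickly relative to the following fence suggests async execution, but only once
one-time initialization costs are out of the way. The idiom, with policy,
functor, and reducer standing in for the test's arguments:

    Kokkos::Timer timer;
    Kokkos::parallel_reduce("probe", policy, functor, reducer);
    double time_launch = timer.seconds();  // time to return from dispatch
    Kokkos::fence();
    double time_total = timer.seconds();   // time including kernel completion
    bool probably_async = time_launch < (time_total - time_launch);
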
diff --git a/packages/kokkos/core/unit_test/TestStackTrace.hpp b/packages/kokkos/core/unit_test/TestStackTrace.hpp
index 284332f3f85e87b9f8fc030084ecc78448da4e38..d34d0f92e959277e7f6a66c0718ce381cd794e61 100644
--- a/packages/kokkos/core/unit_test/TestStackTrace.hpp
+++ b/packages/kokkos/core/unit_test/TestStackTrace.hpp
@@ -73,10 +73,10 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
 
     if (bDynamic) {
       printf("test_f1: %s \n", foutput.c_str());
-      ASSERT_TRUE(std::string::npos != foutput.find("stacktrace_test_f1"));
+      ASSERT_NE(std::string::npos, foutput.find("stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
-        ASSERT_TRUE(std::string::npos == foutput.find(x));
+        ASSERT_EQ(std::string::npos, foutput.find(x));
       }
     }
   }
@@ -92,7 +92,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
                   foutput.find("Test::stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
-        ASSERT_TRUE(std::string::npos == foutput.find(x));
+        ASSERT_EQ(std::string::npos, foutput.find(x));
       }
     }
   }
@@ -114,7 +114,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
       std::string foutput = sstream.str();
       printf("test_f3: %s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
-        ASSERT_TRUE(std::string::npos != foutput.find(x));
+        ASSERT_NE(std::string::npos, foutput.find(x));
       }
     }
     // TODO make sure stacktrace_test_f2/4 don't show up
@@ -129,7 +129,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
       std::string foutput = sstream.str();
       printf("demangled test_f3: %s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
-        ASSERT_TRUE(std::string::npos != foutput.find(x));
+        ASSERT_NE(std::string::npos, foutput.find(x));
       }
     }
 
diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp
index 97ddfd4cf58518bfa494eedf4445ba68fdb1132a..a5e3de85bbc49508a2fe3c456860da9aa0b8af57 100644
--- a/packages/kokkos/core/unit_test/TestTeam.hpp
+++ b/packages/kokkos/core/unit_test/TestTeam.hpp
@@ -137,8 +137,10 @@ struct TestTeamPolicy {
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
         smallest_work, smallest_work, smallest_work);
 #endif
+    (void)none_auto;
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
         smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
+    (void)both_auto;
     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32,
@@ -147,8 +149,10 @@ struct TestTeamPolicy {
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
         smallest_work, smallest_work, Kokkos::AUTO());
 #endif
+    (void)auto_vector;
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
         smallest_work, Kokkos::AUTO(), smallest_work);
+    (void)auto_team;
   }
 
   static void test_for(const size_t league_size) {
@@ -970,7 +974,11 @@ struct ClassNoShmemSizeFunction {
                 double *, ExecSpace,
                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
 
-    int team_size = 8;
+#ifdef KOKKOS_ENABLE_SYCL
+    int team_size = 4;
+#else
+    int team_size      = 8;
+#endif
     if (team_size > ExecSpace::concurrency())
       team_size = ExecSpace::concurrency();
     {
@@ -1115,7 +1123,11 @@ void test_team_mulit_level_scratch_test_lambda() {
       Kokkos::View<double *, ExecSpace,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
 
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 4;
+#else
   int team_size = 8;
+#endif
   if (team_size > ExecSpace::concurrency())
     team_size = ExecSpace::concurrency();
 
@@ -1400,7 +1412,7 @@ struct TestTeamBroadcast<
     // above because the functor switches it back.
     bool setValue = ((lid % ts) != tid);
 
-    teamMember.team_broadcast([&](value_type &var) { var *= 2; }, value,
+    teamMember.team_broadcast([&](value_type &var) { var += var; }, value,
                               lid % ts);
     teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue,
                               lid % ts);
@@ -1465,7 +1477,7 @@ struct TestTeamBroadcast<
     value_type expected_result = 0;
     for (unsigned int i = 0; i < league_size; i++) {
       value_type val =
-          (value_type((i % team_size) * 3) + off) * (value_type)team_size;
+          (value_type((i % team_size) * 3) + off) * value_type(team_size);
       expected_result += val;
     }
     // For comparison purposes treat the reduction as a random walk in the
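
The switch from var *= 2 to var += var above keeps the broadcast functor usable
with wrapper value types that define only operator+= (such as the long_wrapper
introduced in TestTeamBasic.hpp below). The closure form of team_broadcast
applies the callable on the source rank and then broadcasts the result, here
with source_rank standing in for the test's lid % ts:

    // doubling via += requires only operator+= from value_type
    teamMember.team_broadcast([&](value_type& var) { var += var; }, value,
                              source_rank);
    // afterwards every member of the team sees the doubled value
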
diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
index 87c010ac2a0c5701916049532a715c6a5addce15..17899f63b1f7816cff75a34ccdce0b42d0ee1b3e 100644
--- a/packages/kokkos/core/unit_test/TestTeamBasic.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
@@ -105,6 +105,75 @@ TEST(TEST_CATEGORY, team_broadcast_long) {
                     long>::test_teambroadcast(1000, 1);
 }
 
+// FIXME_OPENMPTARGET: CI fails with
+// "Libomptarget error: Copying data from device failed."
+// Possibly because long_wrapper is not trivially copyable.
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+struct long_wrapper {
+  long value;
+
+  KOKKOS_FUNCTION
+  long_wrapper() : value(0) {}
+
+  KOKKOS_FUNCTION
+  long_wrapper(long val) : value(val) {}
+
+  KOKKOS_FUNCTION
+  friend void operator+=(long_wrapper& lhs, const long_wrapper& rhs) {
+    lhs.value += rhs.value;
+  }
+
+  KOKKOS_FUNCTION
+  friend void operator+=(volatile long_wrapper& lhs,
+                         const volatile long_wrapper& rhs) {
+    lhs.value += rhs.value;
+  }
+
+  KOKKOS_FUNCTION
+  void operator=(const long_wrapper& other) { value = other.value; }
+
+  KOKKOS_FUNCTION
+  void operator=(const volatile long_wrapper& other) volatile {
+    value = other.value;
+  }
+  KOKKOS_FUNCTION
+  operator long() const { return value; }
+};
+}  // namespace Test
+
+namespace Kokkos {
+template <>
+struct reduction_identity<Test::long_wrapper>
+    : public reduction_identity<long> {};
+}  // namespace Kokkos
+
+namespace Test {
+
+// Test for non-arithmetic type
+TEST(TEST_CATEGORY, team_broadcast_long_wrapper) {
+  static_assert(!std::is_arithmetic<long_wrapper>::value, "");
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(0, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(0, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(2, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(2, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(16, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(16, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(1000, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(1000, 1);
+}
+#endif
+
 TEST(TEST_CATEGORY, team_broadcast_char) {
   {
     TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
diff --git a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
index 3db0eafa339de221a8dad8feb3cf7b3fa62027f2..836134afe0cd4d537520b12c80dd4efaafc21f38 100644
--- a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
@@ -53,14 +53,8 @@ TEST(TEST_CATEGORY, team_reduction_scan) {
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(0);
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10);
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10);
-// FIXME_HIP
-#ifdef KOKKOS_ENABLE_HIP
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10000);
-    TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10000);
-  }
+  TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10000);
+  TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10000);
 }
 
 TEST(TEST_CATEGORY, team_long_reduce) {
diff --git a/packages/kokkos/core/unit_test/TestTeamScratch.hpp b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
index 75ca3587629ded5f5cc2dd2f3b8ef6623e8a07f7..bab937273ddee06bb55b17b4fefc567c98bac30b 100644
--- a/packages/kokkos/core/unit_test/TestTeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
@@ -54,15 +54,8 @@ TEST(TEST_CATEGORY, team_shared_request) {
 }
 
 TEST(TEST_CATEGORY, team_scratch_request) {
-  // FIXME_HIP the parallel_reduce in this test requires a team size larger than
-  // 256. Fixed in ROCm 3.9
-#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
-    TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
-  }
+  TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
@@ -78,21 +71,14 @@ TEST(TEST_CATEGORY, scratch_align) { TestScratchAlignment<TEST_EXECSPACE>(); }
 TEST(TEST_CATEGORY, shmem_size) { TestShmemSize<TEST_EXECSPACE>(); }
 
 TEST(TEST_CATEGORY, multi_level_scratch) {
-  // FIXME_HIP the parallel_for and the parallel_reduce in this test requires a
-  // team size larger than 256. Fixed In ROCm 3.9
   // FIXME_OPENMPTARGET This unit test needs ~350KB of scratch memory for L0 and
   // L1 combined per team. Currently OpenMPTarget cannot allocate this high
   // amount of scratch memory.
 #if !defined(KOKKOS_ENABLE_OPENMPTARGET)
-#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestMultiLevelScratchTeam<TEST_EXECSPACE,
-                              Kokkos::Schedule<Kokkos::Static> >();
-    TestMultiLevelScratchTeam<TEST_EXECSPACE,
-                              Kokkos::Schedule<Kokkos::Dynamic> >();
-  }
+  TestMultiLevelScratchTeam<TEST_EXECSPACE,
+                            Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam<TEST_EXECSPACE,
+                            Kokkos::Schedule<Kokkos::Dynamic> >();
 #endif
 }
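
For reference, a minimal sketch of the two-level scratch request these tests
exercise (sizes and league/team shapes are illustrative): level 0 is the small,
fast per-team allocation and level 1 the larger, slower one; both are declared
on the policy and carved out inside the kernel.

    using policy_t  = Kokkos::TeamPolicy<TEST_EXECSPACE>;
    using scratch_t = Kokkos::View<double*, TEST_EXECSPACE::scratch_memory_space,
                                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >;

    size_t bytes0 = scratch_t::shmem_size(128);    // level-0 request
    size_t bytes1 = scratch_t::shmem_size(16384);  // level-1 request
    Kokkos::parallel_for(
        policy_t(10, Kokkos::AUTO())
            .set_scratch_size(0, Kokkos::PerTeam(bytes0))
            .set_scratch_size(1, Kokkos::PerTeam(bytes1)),
        KOKKOS_LAMBDA(const policy_t::member_type& team) {
          scratch_t level0(team.team_scratch(0), 128);
          scratch_t level1(team.team_scratch(1), 16384);
          level0(0) = level1(0) = team.league_rank();
        });
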
 
diff --git a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
index 992e80397bacb9b5dc9a0746ca2543a1792cce22..f64c5b8809a214d4e2376e43df29d7900eccd1de 100644
--- a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
@@ -110,9 +110,9 @@ void test_team_policy_max_recommended_static_size(int scratch_size) {
   int team_size_rec_reduce = p.team_size_recommended(
       FunctorReduce<T, N, PolicyType, S>(), Kokkos::ParallelReduceTag());
 
-  ASSERT_TRUE(team_size_max_for >= team_size_rec_for);
-  ASSERT_TRUE(team_size_max_reduce >= team_size_rec_reduce);
-  ASSERT_TRUE(team_size_max_for >= team_size_max_reduce);
+  ASSERT_GE(team_size_max_for, team_size_rec_for);
+  ASSERT_GE(team_size_max_reduce, team_size_rec_reduce);
+  ASSERT_GE(team_size_max_for, team_size_max_reduce);
 
   Kokkos::parallel_for(PolicyType(10000, team_size_max_for, 4)
                            .set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
@@ -122,13 +122,6 @@ void test_team_policy_max_recommended_static_size(int scratch_size) {
                        FunctorFor<T, N, PolicyType, S>());
   MyArray<T, N> val;
   double n_leagues = 10000;
-  // FIXME_HIP
-#ifdef KOKKOS_ENABLE_HIP
-  if (N == 2)
-    n_leagues = 1000;
-  else
-    n_leagues = 500;
-#endif
 
   Kokkos::parallel_reduce(
       PolicyType(n_leagues, team_size_max_reduce, 4)
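
The three ASSERT_GE checks above encode the contract between the two team-size
queries: the hard cap the backend can actually launch is never below its own
recommendation, for either dispatch kind. Schematically (policy shape
illustrative, functor and tags as in the test):

    Kokkos::TeamPolicy<TEST_EXECSPACE> p(10000, Kokkos::AUTO(), 4);
    int ts_max = p.team_size_max(FunctorFor<T, N, PolicyType, S>(),
                                 Kokkos::ParallelForTag());
    int ts_rec = p.team_size_recommended(FunctorFor<T, N, PolicyType, S>(),
                                         Kokkos::ParallelForTag());
    // ts_max >= ts_rec must hold; the same pair exists for ParallelReduceTag
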
diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp
index ba11dc07a962989f2826a3d0def3649112c00da6..dbed67475615606915cfcc05959de312f9eacbfd 100644
--- a/packages/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -111,7 +111,7 @@ struct functor_team_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_parallel_for %i %i %f %f\n", team.league_rank(),
+              "FAILED team_parallel_for %i %i %lf %lf\n", team.league_rank(),
               team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
           flag() = 1;
@@ -321,10 +321,9 @@ struct functor_team_vector_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_vector_parallel_for %i %i %f %f\n",
+              "FAILED team_vector_parallel_for %i %i %lf %lf\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
-
           flag() = 1;
         }
       });
@@ -372,7 +371,7 @@ struct functor_team_vector_reduce {
       if (test != value) {
         if (team.league_rank() == 0) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+              "FAILED team_vector_parallel_reduce %i %i %lf %lf %lu\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value),
               static_cast<unsigned long>(sizeof(Scalar)));
@@ -424,7 +423,7 @@ struct functor_team_vector_reduce_reducer {
 
       if (test != value) {
         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
+            "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
             static_cast<double>(value));
 
@@ -471,8 +470,9 @@ struct functor_vec_single {
 
     if (value2 != (value * Scalar(nEnd - nStart))) {
       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(),
-          team.team_rank(), (double)value2, (double)value);
+          "FAILED vector_single broadcast %i %i %lf %lf\n", team.league_rank(),
+          team.team_rank(), static_cast<double>(value2),
+          static_cast<double>(value));
 
       flag() = 1;
     }
@@ -523,7 +523,7 @@ struct functor_vec_for {
         }
 
         if (test != value) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %f %f\n",
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %lf %lf\n",
                                         team.league_rank(), team.team_rank(),
                                         static_cast<double>(test),
                                         static_cast<double>(value));
@@ -560,10 +560,9 @@ struct functor_vec_red {
       for (int i = 0; i < 13; i++) test += i;
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_reduce %i %i %f %f\n",
-                                      team.league_rank(), team.team_rank(),
-                                      (double)test, (double)value);
-
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "FAILED vector_par_reduce %i %i %lf %lf\n", team.league_rank(),
+            team.team_rank(), (double)test, (double)value);
         flag() = 1;
       }
     });
@@ -600,7 +599,7 @@ struct functor_vec_red_reducer {
 
       if (test != value) {
         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED vector_par_reduce_reducer %i %i %f %f\n",
+            "FAILED vector_par_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), (double)test, (double)value);
 
         flag() = 1;
@@ -630,9 +629,10 @@ struct functor_vec_scan {
 
                               if (test != val) {
                                 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-                                    "FAILED vector_par_scan %i %i %f %f\n",
+                                    "FAILED vector_par_scan %i %i %lf %lf\n",
                                     team.league_rank(), team.team_rank(),
-                                    (double)test, (double)val);
+                                    static_cast<double>(test),
+                                    static_cast<double>(val));
 
                                 flag() = 1;
                               }
@@ -723,7 +723,12 @@ template <class ExecutionSpace>
 bool Test(int test) {
   bool passed = true;
 
+// With SYCL, 33 * 8 exceeds the maximum work group size
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 31;
+#else
   int team_size = 33;
+#endif
   if (team_size > int(ExecutionSpace::concurrency()))
     team_size = int(ExecutionSpace::concurrency());
   passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
@@ -856,7 +861,7 @@ template <typename ScalarType, class DeviceType>
 class TestTripleNestedReduce {
  public:
   using execution_space = DeviceType;
-  using size_type       = typename execution_space::size_type;
+  using size_type = typename execution_space::size_type;
 
   TestTripleNestedReduce(const size_type &, const size_type, const size_type &,
                          const size_type) {}
@@ -1000,17 +1005,24 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) {
 // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run
 // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80
 // GPU) See https://github.com/kokkos/kokkos/issues/1513
+// For Intel GPUs, the requested workgroup size is just too large here.
 #if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) {
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value)
+#elif defined(KOKKOS_ENABLE_SYCL)
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
 #endif
+  {
     TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 32);
     TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 16);
-#if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
   }
+#if defined(KOKKOS_ENABLE_SYCL)
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
 #endif
+  {
+    TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33);
+    TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 19);
+  }
   TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 16);
-  TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33);
-  TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 19);
   TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 7, 16);
 }
 #endif
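
The triple-nested test drives all three parallelism levels at once;
schematically (league_size, team_size, vector_length, m, and n are the test's
tuning knobs):

    using policy_t = Kokkos::TeamPolicy<TEST_EXECSPACE>;
    Kokkos::parallel_for(
        policy_t(league_size, team_size, vector_length),
        KOKKOS_LAMBDA(const policy_t::member_type& team) {
          Kokkos::parallel_for(
              Kokkos::TeamThreadRange(team, m), [&](const int i) {
                Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, n),
                                     [&](const int j) {
                                       // innermost work on (i, j)
                                     });
              });
        });
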
diff --git a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
index 7342ebad8433526719b52058ff6d6b75e41a107a..c4116b91392e2020ecf0a030f96536c3a47a6dfa 100644
--- a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -280,7 +280,7 @@ struct functor_teamvector_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED teamvector_parallel_for %i %i %f %f\n",
+              "FAILED teamvector_parallel_for %i %i %lf %lf\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
           flag() = 1;
@@ -493,7 +493,12 @@ template <class ExecutionSpace>
 bool Test(int test) {
   bool passed = true;
 
+// With SYCL, 33 * 8 exceeds the maximum work group size
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 31;
+#else
   int team_size = 33;
+#endif
   if (team_size > int(ExecutionSpace::concurrency()))
     team_size = int(ExecutionSpace::concurrency());
   passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
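
Unlike the TeamThreadRange/ThreadVectorRange pair in TestTeamVector.hpp, this
file exercises TeamVectorRange, which fuses the thread and vector levels into a
single loop spread across the whole team (team and n stand in for the test's
member handle and extent):

    Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n), [&](const int i) {
      // work for index i, distributed over both threads and vector lanes
    });
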
diff --git a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
index a0bc7c4304a040a10bc182e5d23d7c9ba08c4110..a0d00ded1b1586e67eb6bc09f93cf386239c3e2d 100644
--- a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -138,72 +138,38 @@ struct SumInitJoinFinalValueTypeArray {
   }
 };
 
-template <class Scalar, class ExecutionSpace>
-struct SumWrongInitJoinFinalValueType {
-  using execution_space = ExecutionSpace;
-  using type            = typename Kokkos::View<Scalar*, execution_space>;
-  using value_type      = Scalar;
-
-  type view;
-
-  SumWrongInitJoinFinalValueType(type view_) : view(view_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void init(double& val) const { val = double(); }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const value_type& src) const {
-    val += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int /*i*/, value_type& val) const { val += value_type(); }
-};
-
 template <class Scalar, class ExecutionSpace>
 void TestTemplateMetaFunctions() {
-  using type = typename Kokkos::View<Scalar*, ExecutionSpace>;
-  type a("A", 100);
-  /*
-    int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit< SumPlain<Scalar,
-    ExecutionSpace>, Scalar & >::value; ASSERT_EQ( sum_plain_has_init_arg, 0 );
-    int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<
-    SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg, 1 );
-    int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<
-    SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg2, 1 );
-    int sum_wronginitjoinfinalvaluetype_has_init_arg =
-    Kokkos::Impl::FunctorHasInit< SumWrongInitJoinFinalValueType<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ(
-    sum_wronginitjoinfinalvaluetype_has_init_arg, 0 );
-
-    //int sum_initjoinfinalvaluetypearray_has_init_arg =
-    Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueTypeArray<Scalar,
-    ExecutionSpace>, Scalar[] >::value;
-    //ASSERT_EQ( sum_initjoinfinalvaluetypearray_has_init_arg, 1 );
-
-    //printf( "Values Init: %i %i %i\n", sum_plain_has_init_arg,
-    sum_initjoinfinalvaluetype_has_init_arg,
-    sum_wronginitjoinfinalvaluetype_has_init_arg );
-
-    int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumPlain<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ( sum_plain_has_join_arg, 0 );
-    int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<
-    SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg, 1 );
-    int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<
-    SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg2, 1 );
-    int sum_wronginitjoinfinalvaluetype_has_join_arg =
-    Kokkos::Impl::FunctorHasJoin< SumWrongInitJoinFinalValueType<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ(
-    sum_wronginitjoinfinalvaluetype_has_join_arg, 0 );
-
-    //printf( "Values Join: %i %i %i\n", sum_plain_has_join_arg,
-    sum_initjoinfinalvaluetype_has_join_arg,
-    sum_wronginitjoinfinalvaluetype_has_join_arg );
-  */
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<SumPlain<Scalar, ExecutionSpace>,
+                                         Scalar&>::value == false,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
+      "");
+
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueTypeArray<Scalar, ExecutionSpace>>::value ==
+          true,
+      "");
+
+  static_assert(Kokkos::Impl::ReduceFunctorHasJoin<
+                    SumPlain<Scalar, ExecutionSpace>>::value == false,
+                "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasJoin<
+          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasJoin<
+          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
+      "");
 }
 
 }  // namespace
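
The commented-out FunctorHasInit/FunctorHasJoin probes are replaced by compile-time checks against the ReduceFunctorHasInit/ReduceFunctorHasJoin traits. A simplified sketch of the detection idiom such traits rely on, assuming C++17's std::void_t; has_init is a hypothetical stand-in, and the real Kokkos traits are more general:

#include <type_traits>

template <class, class = void>
struct has_init : std::false_type {};

// This specialization is viable only if F has init(value_type&).
template <class F>
struct has_init<F, std::void_t<decltype(std::declval<F>().init(
                       std::declval<typename F::value_type&>()))>>
    : std::true_type {};

struct WithInit {
  using value_type = double;
  void init(value_type& v) const { v = 0.0; }
};
struct WithoutInit {
  using value_type = double;
};

static_assert(has_init<WithInit>::value, "init(value_type&) detected");
static_assert(!has_init<WithoutInit>::value, "no init() member");
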
diff --git a/packages/kokkos/core/unit_test/TestTypeList.cpp b/packages/kokkos/core/unit_test/TestTypeList.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e450d11562819c756334daeea91f3f448df624db
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTypeList.cpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_Utilities.hpp>
+
+using TypeList2 = Kokkos::Impl::type_list<void, bool>;
+using TypeList3 = Kokkos::Impl::type_list<char, short, int>;
+using TypeList223 =
+    Kokkos::Impl::type_list<void, bool, void, bool, char, short, int>;
+using TypeList223Void   = Kokkos::Impl::type_list<void, void>;
+using TypeList223NoVoid = Kokkos::Impl::type_list<bool, bool, char, short, int>;
+
+// concat_type_list
+using ConcatTypeList2 = Kokkos::Impl::concat_type_list_t<TypeList2>;
+static_assert(std::is_same<TypeList2, ConcatTypeList2>::value,
+              "concat_type_list of a single type_list failed");
+
+using ConcatTypeList223 =
+    Kokkos::Impl::concat_type_list_t<TypeList2, TypeList2, TypeList3>;
+static_assert(std::is_same<TypeList223, ConcatTypeList223>::value,
+              "concat_type_list of three type_lists failed");
+
+// filter_type_list
+using FilterTypeList223Void =
+    Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223>;
+static_assert(std::is_same<TypeList223Void, FilterTypeList223Void>::value,
+              "filter_type_list with predicate value==true failed");
+
+using FilterTypeList223NoVoid =
+    Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223, false>;
+static_assert(std::is_same<TypeList223NoVoid, FilterTypeList223NoVoid>::value,
+              "filter_type_list with predicate value==false failed");
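
The new test pins down the behavior of concat_type_list_t and filter_type_list_t. For orientation, a hedged sketch of one way list concatenation can be implemented; the actual code in <impl/Kokkos_Utilities.hpp> may differ:

#include <type_traits>

template <class... Ts>
struct type_list {};

template <class...>
struct concat;

// Base case: a single list is returned unchanged.
template <class... Ts>
struct concat<type_list<Ts...>> {
  using type = type_list<Ts...>;
};

// Fold the first two lists together, then recurse on the rest.
template <class... Ts, class... Us, class... Rest>
struct concat<type_list<Ts...>, type_list<Us...>, Rest...>
    : concat<type_list<Ts..., Us...>, Rest...> {};

template <class... Lists>
using concat_t = typename concat<Lists...>::type;

static_assert(
    std::is_same<concat_t<type_list<int>, type_list<bool, char>>,
                 type_list<int, bool, char>>::value,
    "lists are flattened left to right");
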
diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp
index 570281f9fd66a230e69b9bb924a84a0078e12168..73531e6196f0ca145789ef98f680328ece747df9 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1060,12 +1060,12 @@ class TestViewAPI {
     dView4 dx, dy, dz;
     hView4 hx, hy, hz;
 
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
-    ASSERT_TRUE(hx.data() == nullptr);
-    ASSERT_TRUE(hy.data() == nullptr);
-    ASSERT_TRUE(hz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
+    ASSERT_EQ(hx.data(), nullptr);
+    ASSERT_EQ(hy.data(), nullptr);
+    ASSERT_EQ(hz.data(), nullptr);
     ASSERT_EQ(dx.extent(0), 0u);
     ASSERT_EQ(dy.extent(0), 0u);
     ASSERT_EQ(dz.extent(0), 0u);
@@ -1116,11 +1116,11 @@ class TestViewAPI {
 
     ASSERT_EQ(dx.use_count(), size_t(2));
 
-    ASSERT_FALSE(dx.data() == nullptr);
-    ASSERT_FALSE(const_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
+    ASSERT_NE(dx.data(), nullptr);
+    ASSERT_NE(const_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
     ASSERT_NE(dx, dy);
 
     ASSERT_EQ(dx.extent(0), unsigned(N0));
@@ -1257,19 +1257,19 @@ class TestViewAPI {
     ASSERT_NE(dx, dz);
 
     dx = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
 
     dy = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
 
     dz = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
   }
 
   static void run_test_deep_copy_empty() {
@@ -1304,7 +1304,7 @@ class TestViewAPI {
   static void check_auto_conversion_to_const(
       const Kokkos::View<const DataType, device> &arg_const,
       const Kokkos::View<DataType, device> &arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_const() {
@@ -1317,8 +1317,8 @@ class TestViewAPI {
     const_typeX xc = x;
     const_typeR xr = x;
 
-    ASSERT_TRUE(xc == x);
-    ASSERT_TRUE(x == xc);
+    ASSERT_EQ(xc, x);
+    ASSERT_EQ(x, xc);
 
     // For CUDA the constant random access View does not return
     // an lvalue reference due to retrieving through texture cache
@@ -1327,7 +1327,7 @@ class TestViewAPI {
     if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value)
 #endif
     {
-      ASSERT_TRUE(x.data() == xr.data());
+      ASSERT_EQ(x.data(), xr.data());
     }
 
     // typeX xf = xc; // Setting non-const from const must not compile.
@@ -1440,29 +1440,29 @@ class TestViewAPI {
     const_vector_right_type cvr2 = Kokkos::subview(mv, Kokkos::ALL(), 1);
     const_vector_right_type cvr3 = Kokkos::subview(mv, Kokkos::ALL(), 2);
 
-    ASSERT_TRUE(&v1[0] == &v1(0));
-    ASSERT_TRUE(&v1[0] == &mv(0, 0));
-    ASSERT_TRUE(&v2[0] == &mv(0, 1));
-    ASSERT_TRUE(&v3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cv1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cv2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cv3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&vr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&vr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&vr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cvr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cvr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cvr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2));
-    ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3));
-    ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4));
-    ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2));
-    ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3));
-    ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4));
+    ASSERT_EQ(&v1[0], &v1(0));
+    ASSERT_EQ(&v1[0], &mv(0, 0));
+    ASSERT_EQ(&v2[0], &mv(0, 1));
+    ASSERT_EQ(&v3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cv1[0], &mv(0, 0));
+    ASSERT_EQ(&cv2[0], &mv(0, 1));
+    ASSERT_EQ(&cv3[0], &mv(0, 2));
+
+    ASSERT_EQ(&vr1[0], &mv(0, 0));
+    ASSERT_EQ(&vr2[0], &mv(0, 1));
+    ASSERT_EQ(&vr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cvr1[0], &mv(0, 0));
+    ASSERT_EQ(&cvr2[0], &mv(0, 1));
+    ASSERT_EQ(&cvr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&mv1(0, 0), &mv(1, 2));
+    ASSERT_EQ(&mv1(1, 1), &mv(2, 3));
+    ASSERT_EQ(&mv1(3, 2), &mv(4, 4));
+    ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2));
+    ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3));
+    ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4));
 
     const_vector_type c_cv1(v1);
     typename vector_type::const_type c_cv2(v2);
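
The ASSERT_TRUE(a == b) to ASSERT_EQ(a, b) conversion throughout this file is not purely cosmetic: on failure, ASSERT_EQ prints both operands, while ASSERT_TRUE can only report that the expression evaluated to false. An illustration with hypothetical values:

// Suppose dx.extent(0) is 8.
// ASSERT_TRUE(dx.extent(0) == 7u);  // failure output only says "false"
// ASSERT_EQ(dx.extent(0), 7u);      // failure output shows both 8 and 7
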
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
index a5dc6cf29a467bd576bd96bca52f90b3db26324b..d4f484a530c952a33b20dada3222180c7785f06a 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
@@ -54,23 +54,24 @@ namespace Test {
 TEST(TEST_CATEGORY, view_remap) {
   enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
 
-#ifdef KOKKOS_ENABLE_CUDA
+#if defined(KOKKOS_ENABLE_CUDA)
 #define EXECSPACE                                                     \
   std::conditional<std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value, \
                    Kokkos::CudaHostPinnedSpace, TEST_EXECSPACE>::type
-#else
-#ifdef KOKKOS_ENABLE_HIP
+#elif defined(KOKKOS_ENABLE_HIP)
 #define EXECSPACE                                                     \
   std::conditional<                                                   \
       std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value, \
       Kokkos::Experimental::HIPHostPinnedSpace, TEST_EXECSPACE>::type
-#else
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_SYCL)
+#elif defined(KOKKOS_ENABLE_SYCL)
+#define EXECSPACE                                                      \
+  std::conditional<                                                    \
+      std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value, \
+      Kokkos::Experimental::SYCLHostUSMSpace, TEST_EXECSPACE>::type
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
 #define EXECSPACE Kokkos::HostSpace
 #else
 #define EXECSPACE TEST_EXECSPACE
-#endif
-#endif
 #endif
 
   using output_type =
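
The flattened #elif chain above replaces the previously nested #ifdef blocks and adds a SYCL branch that selects a host-accessible USM space. For reference, a sketch of the same per-backend choice expressed as an alias template rather than a macro, shown for CUDA only; host_pinned_space_t is an illustrative name:

#include <type_traits>
#include <Kokkos_Core.hpp>

#if defined(KOKKOS_ENABLE_CUDA)
// If the test executes in the CUDA space, substitute a space the host
// can also access; otherwise keep the test's own space.
template <class Exec>
using host_pinned_space_t =
    typename std::conditional<std::is_same<Exec, Kokkos::Cuda>::value,
                              Kokkos::CudaHostPinnedSpace, Exec>::type;
#endif
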
diff --git a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
index e25cb9e39ca6fd4c3cd45ef2b60b404ed82c03e7..ced0aa3828cffabfc196fd9dac146718c0f005d3 100644
--- a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
@@ -96,10 +96,10 @@ TEST(TEST_CATEGORY, view_copy_tests) {
   auto host = Kokkos::DefaultHostExecutionSpace();
 
   constexpr bool DevExecCanAccessHost =
-      Kokkos::Impl::SpaceAccessibility<typename TEST_EXECSPACE::execution_space,
-                                       Kokkos::HostSpace>::accessible;
+      Kokkos::SpaceAccessibility<typename TEST_EXECSPACE::execution_space,
+                                 Kokkos::HostSpace>::accessible;
 
-  constexpr bool HostExecCanAccessDev = Kokkos::Impl::SpaceAccessibility<
+  constexpr bool HostExecCanAccessDev = Kokkos::SpaceAccessibility<
       typename Kokkos::HostSpace::execution_space,
       typename TEST_EXECSPACE::memory_space>::accessible;
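
This hunk, like several below, switches from the internal Kokkos::Impl::SpaceAccessibility to the public Kokkos::SpaceAccessibility spelling. A short usage sketch of the public trait, following the same guard pattern the subview tests use; host_can_touch is an illustrative helper:

#include <Kokkos_Core.hpp>

// True when HostSpace can reach the view's memory space directly.
template <class ViewType>
bool host_can_touch(const ViewType&) {
  return Kokkos::SpaceAccessibility<
      Kokkos::HostSpace, typename ViewType::memory_space>::accessible;
}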
 
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
index fdbda099176c79410c1be6599546f09aba3269dc..974d7c98cafb56c91df55b425913b14c0dfd3ca1 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
@@ -768,8 +768,8 @@ void test_view_mapping() {
 
     ASSERT_EQ(vr1.extent(0), N);
 
-    if (Kokkos::Impl::SpaceAccessibility<
-            Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+    if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                   typename Space::memory_space>::accessible) {
       for (int i = 0; i < N; ++i) data[i] = i + 1;
       for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1);
       for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1);
@@ -815,8 +815,8 @@ void test_view_mapping() {
 
     ASSERT_EQ(vr1.extent(0), N);
 
-    if (Kokkos::Impl::SpaceAccessibility<
-            Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+    if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                   typename Space::memory_space>::accessible) {
       for (int i = 0; i < N; ++i) vr1(i) = i + 1;
       for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1);
       for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1);
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
index 18db67400d6ea03ecb98e891150cb4a154311982..2a15a84380e7c6d979059a8342c64b9ee68d2eb9 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
@@ -81,7 +81,7 @@ struct TestViewMappingSubview {
   using DLT  = Kokkos::View<int** * [13][14], Kokkos::LayoutLeft, ExecSpace>;
   using DLS1 = Kokkos::Subview<DLT, range, int, int, int, int>;
 
-#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
   static_assert(
       DLS1::rank == 1 &&
           std::is_same<typename DLS1::array_layout, Kokkos::LayoutLeft>::value,
@@ -92,7 +92,7 @@ struct TestViewMappingSubview {
   using DRT  = Kokkos::View<int** * [13][14], Kokkos::LayoutRight, ExecSpace>;
   using DRS1 = Kokkos::Subview<DRT, int, int, int, int, range>;
 
-#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
   static_assert(
       DRS1::rank == 1 &&
           std::is_same<typename DRS1::array_layout, Kokkos::LayoutRight>::value,
diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp
index 0125017d93786101e2a23a866effe9d8a5e5242d..93eb5476b57be796f5e1fbcb8e9b3db140bb1615 100644
--- a/packages/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp
@@ -184,7 +184,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ONE);
+      ASSERT_EQ(X_h(i, j), ONE);
     }
   }
 
@@ -194,7 +194,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ZERO);
+      ASSERT_EQ(X_h(i, j), ZERO);
     }
   }
 
@@ -204,7 +204,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == TWO);
+      ASSERT_EQ(X_h(i, j), TWO);
     }
   }
 
@@ -216,7 +216,7 @@ void test_auto_1d() {
     Kokkos::fence();
     Kokkos::deep_copy(X_h, X);
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ZERO);
+      ASSERT_EQ(X_h(i, j), ZERO);
     }
 
     for (size_type jj = 0; jj < numCols; ++jj) {
@@ -226,7 +226,7 @@ void test_auto_1d() {
       Kokkos::fence();
       Kokkos::deep_copy(X_h, X);
       for (size_type i = 0; i < numRows; ++i) {
-        ASSERT_TRUE(X_h(i, jj) == ONE);
+        ASSERT_EQ(X_h(i, jj), ONE);
       }
     }
   }
@@ -240,38 +240,38 @@ void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n,
   int col = n > 2 ? 2 : 0;
   int row = m > 2 ? 2 : 0;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     if (a) {
       Kokkos::View<double*, LD, Space> l1da =
           Kokkos::subview(l2d, Kokkos::ALL, row);
-      ASSERT_TRUE(&l1da(0) == &l2d(0, row));
+      ASSERT_EQ(&l1da(0), &l2d(0, row));
       if (n > 1) {
-        ASSERT_TRUE(&l1da(1) == &l2d(1, row));
+        ASSERT_EQ(&l1da(1), &l2d(1, row));
       }
     }
 
     if (b && n > 13) {
       Kokkos::View<double*, LD, Space> l1db =
           Kokkos::subview(l2d, std::pair<unsigned, unsigned>(2, 13), row);
-      ASSERT_TRUE(&l1db(0) == &l2d(2, row));
-      ASSERT_TRUE(&l1db(1) == &l2d(3, row));
+      ASSERT_EQ(&l1db(0), &l2d(2, row));
+      ASSERT_EQ(&l1db(1), &l2d(3, row));
     }
 
     if (c) {
       Kokkos::View<double*, LD, Space> l1dc =
           Kokkos::subview(l2d, col, Kokkos::ALL);
-      ASSERT_TRUE(&l1dc(0) == &l2d(col, 0));
+      ASSERT_EQ(&l1dc(0), &l2d(col, 0));
       if (m > 1) {
-        ASSERT_TRUE(&l1dc(1) == &l2d(col, 1));
+        ASSERT_EQ(&l1dc(1), &l2d(col, 1));
       }
     }
 
     if (d && m > 13) {
       Kokkos::View<double*, LD, Space> l1dd =
           Kokkos::subview(l2d, col, std::pair<unsigned, unsigned>(2, 13));
-      ASSERT_TRUE(&l1dd(0) == &l2d(col, 2));
-      ASSERT_TRUE(&l1dd(1) == &l2d(col, 3));
+      ASSERT_EQ(&l1dd(0), &l2d(col, 2));
+      ASSERT_EQ(&l1dd(1), &l2d(col, 3));
     }
   }
 }
@@ -326,8 +326,8 @@ void test_left_0(bool constr) {
   using view_static_8_type =
       Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_static_8_type x_static_8("x_static_left_8");
 
     ASSERT_TRUE(x_static_8.span_is_contiguous());
@@ -337,7 +337,7 @@ void test_left_0(bool constr) {
 
     ASSERT_TRUE(x0.span_is_contiguous());
     ASSERT_EQ(x0.span(), 1);
-    ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1;
     make_subview(constr, x1, x_static_8, Kokkos::pair<int, int>(0, 2), 1, 2, 3,
@@ -345,8 +345,8 @@ void test_left_0(bool constr) {
 
     ASSERT_TRUE(x1.span_is_contiguous());
     ASSERT_EQ(x1.span(), 2);
-    ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x_deg1;
     make_subview(constr, x_deg1, x_static_8, Kokkos::pair<int, int>(0, 0), 1, 2,
@@ -369,10 +369,10 @@ void test_left_0(bool constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 1), &x_static_8(0, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2;
@@ -380,10 +380,10 @@ void test_left_0(bool constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4;
     make_subview(constr, sx4, x_static_8, 0,
@@ -402,9 +402,8 @@ void test_left_0(bool constr) {
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1,
-                                                            1 + i1, 1, 0 + i2,
-                                                            2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x_static_8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -420,8 +419,8 @@ void test_left_1(bool use_constr) {
   using view_type =
       Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x8("x_left_8", 2, 3, 4, 5);
 
     ASSERT_TRUE(x8.span_is_contiguous());
@@ -430,15 +429,15 @@ void test_left_1(bool use_constr) {
     make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &x8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1;
     make_subview(use_constr, x1, x8, Kokkos::pair<int, int>(0, 2), 1, 2, 3, 0,
                  1, 2, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
-    ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x8(1, 1, 2, 3, 0, 1, 2, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1_deg1;
     make_subview(use_constr, x1_deg1, x8, Kokkos::pair<int, int>(0, 0), 1, 2, 3,
@@ -461,10 +460,10 @@ void test_left_1(bool use_constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 0), &x8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 1), &x8(0, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_deg2;
     make_subview(use_constr, x2_deg2, x8, Kokkos::pair<int, int>(2, 2), 2, 3, 4,
@@ -477,10 +476,10 @@ void test_left_1(bool use_constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg;
     make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 0), 2, 3,
@@ -520,8 +519,8 @@ template <class Space>
 void test_left_2() {
   using view_type = Kokkos::View<int****, Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x4("x4", 2, 3, 4, 5);
 
     ASSERT_TRUE(x4.span_is_contiguous());
@@ -530,35 +529,35 @@ void test_left_2() {
         Kokkos::subview(x4, 0, 0, 0, 0);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &x4(0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x4(0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 =
         Kokkos::subview(x4, Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
-    ASSERT_TRUE(&x1(0) == &x4(0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x4(1, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x4(0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x4(1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 = Kokkos::subview(
         x4, Kokkos::pair<int, int>(0, 2), 1, Kokkos::pair<int, int>(1, 3), 2);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x4(0, 1, 1, 2));
-    ASSERT_TRUE(&x2(1, 0) == &x4(1, 1, 1, 2));
-    ASSERT_TRUE(&x2(0, 1) == &x4(0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x4(1, 1, 2, 2));
+    ASSERT_EQ(&x2(0, 0), &x4(0, 1, 1, 2));
+    ASSERT_EQ(&x2(1, 0), &x4(1, 1, 1, 2));
+    ASSERT_EQ(&x2(0, 1), &x4(0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x4(1, 1, 2, 2));
 
     // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2 = Kokkos::subview(
         x4, 1, Kokkos::pair<int, int>(0, 2), 2, Kokkos::pair<int, int>(1, 4));
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x4(1, 0, 2, 1));
-    ASSERT_TRUE(&sx2(1, 0) == &x4(1, 1, 2, 1));
-    ASSERT_TRUE(&sx2(0, 1) == &x4(1, 0, 2, 2));
-    ASSERT_TRUE(&sx2(1, 1) == &x4(1, 1, 2, 2));
-    ASSERT_TRUE(&sx2(0, 2) == &x4(1, 0, 2, 3));
-    ASSERT_TRUE(&sx2(1, 2) == &x4(1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x4(1, 0, 2, 1));
+    ASSERT_EQ(&sx2(1, 0), &x4(1, 1, 2, 1));
+    ASSERT_EQ(&sx2(0, 1), &x4(1, 0, 2, 2));
+    ASSERT_EQ(&sx2(1, 1), &x4(1, 1, 2, 2));
+    ASSERT_EQ(&sx2(0, 2), &x4(1, 0, 2, 3));
+    ASSERT_EQ(&sx2(1, 2), &x4(1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4 =
         Kokkos::subview(x4, Kokkos::pair<int, int>(1, 2) /* of [2] */
@@ -586,8 +585,8 @@ template <class Space>
 void test_left_3() {
   using view_type = Kokkos::View<int**, Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type xm("x4", 10, 5);
 
     ASSERT_TRUE(xm.span_is_contiguous());
@@ -595,14 +594,14 @@ void test_left_3() {
     Kokkos::View<int, Kokkos::LayoutLeft, Space> x0 = Kokkos::subview(xm, 5, 3);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &xm(5, 3));
+    ASSERT_EQ(&x0(), &xm(5, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 =
         Kokkos::subview(xm, Kokkos::ALL, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
     for (int i = 0; i < int(xm.extent(0)); ++i) {
-      ASSERT_TRUE(&x1(i) == &xm(i, 3));
+      ASSERT_EQ(&x1(i), &xm(i, 3));
     }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 =
@@ -611,7 +610,7 @@ void test_left_3() {
     ASSERT_TRUE(!x2.span_is_contiguous());
     for (int j = 0; j < int(x2.extent(1)); ++j)
       for (int i = 0; i < int(x2.extent(0)); ++i) {
-        ASSERT_TRUE(&x2(i, j) == &xm(1 + i, j));
+        ASSERT_EQ(&x2(i, j), &xm(1 + i, j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2c =
@@ -620,20 +619,20 @@ void test_left_3() {
     ASSERT_TRUE(x2c.span_is_contiguous());
     for (int j = 0; j < int(x2c.extent(1)); ++j)
       for (int i = 0; i < int(x2c.extent(0)); ++i) {
-        ASSERT_TRUE(&x2c(i, j) == &xm(i, 2 + j));
+        ASSERT_EQ(&x2c(i, j), &xm(i, 2 + j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n1 =
         Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL);
 
-    ASSERT_TRUE(x2_n1.extent(0) == 0);
-    ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1));
+    ASSERT_EQ(x2_n1.extent(0), 0);
+    ASSERT_EQ(x2_n1.extent(1), xm.extent(1));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n2 =
         Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1));
 
-    ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0));
-    ASSERT_TRUE(x2_n2.extent(1) == 0);
+    ASSERT_EQ(x2_n2.extent(0), xm.extent(0));
+    ASSERT_EQ(x2_n2.extent(1), 0);
   }
 }
 
@@ -644,46 +643,46 @@ void test_right_0(bool use_constr) {
   using view_static_8_type =
       Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_static_8_type x_static_8("x_static_right_8");
 
     Kokkos::View<int, Kokkos::LayoutRight, Space> x0;
     make_subview(use_constr, x0, x_static_8, 0, 0, 0, 0, 0, 0, 0, 0);
 
-    ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1;
     make_subview(use_constr, x1, x_static_8, 0, 1, 2, 3, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(x1.extent(0) == 2);
-    ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 1));
-    ASSERT_TRUE(&x1(1) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 2));
+    ASSERT_EQ(x1.extent(0), 2);
+    ASSERT_EQ(&x1(0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 1));
+    ASSERT_EQ(&x1(1), &x_static_8(0, 1, 2, 3, 0, 1, 2, 2));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2;
     make_subview(use_constr, x2, x_static_8, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3), 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(x2.extent(0) == 2);
-    ASSERT_TRUE(x2.extent(1) == 2);
-    ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(1, 0) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 2));
+    ASSERT_EQ(x2.extent(0), 2);
+    ASSERT_EQ(x2.extent(1), 2);
+    ASSERT_EQ(&x2(0, 0), &x_static_8(0, 1, 2, 1, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(1, 0), &x_static_8(0, 1, 2, 2, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(0, 1), &x_static_8(0, 1, 2, 1, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x_static_8(0, 1, 2, 2, 0, 1, 2, 2));
 
     // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2;
     make_subview(use_constr, sx2, x_static_8, 1, Kokkos::pair<int, int>(0, 2),
                  2, 3, Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
-    ASSERT_TRUE(sx2.extent(0) == 2);
-    ASSERT_TRUE(sx2.extent(1) == 2);
-    ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(sx2.extent(0), 2);
+    ASSERT_EQ(sx2.extent(1), 2);
+    ASSERT_EQ(&sx2(0, 0), &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4;
     make_subview(use_constr, sx4, x_static_8, 0,
@@ -696,17 +695,16 @@ void test_right_0(bool use_constr) {
                  2, Kokkos::pair<int, int>(2, 4) /* of [5] */
     );
 
-    ASSERT_TRUE(sx4.extent(0) == 2);
-    ASSERT_TRUE(sx4.extent(1) == 2);
-    ASSERT_TRUE(sx4.extent(2) == 2);
-    ASSERT_TRUE(sx4.extent(3) == 2);
+    ASSERT_EQ(sx4.extent(0), 2);
+    ASSERT_EQ(sx4.extent(1), 2);
+    ASSERT_EQ(sx4.extent(2), 2);
+    ASSERT_EQ(sx4.extent(3), 2);
     for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0)
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1,
-                                                            1 + i1, 1, 0 + i2,
-                                                            2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x_static_8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -722,21 +720,21 @@ void test_right_1(bool use_constr) {
   using view_type =
       Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x8("x_right_8", 2, 3, 4, 5);
 
     Kokkos::View<int, Kokkos::LayoutRight, Space> x0;
     make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0);
 
-    ASSERT_TRUE(&x0() == &x8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1;
     make_subview(use_constr, x1, x8, 0, 1, 2, 3, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 1));
-    ASSERT_TRUE(&x1(1) == &x8(0, 1, 2, 3, 0, 1, 2, 2));
+    ASSERT_EQ(&x1(0), &x8(0, 1, 2, 3, 0, 1, 2, 1));
+    ASSERT_EQ(&x1(1), &x8(0, 1, 2, 3, 0, 1, 2, 2));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1_deg1;
     make_subview(use_constr, x1_deg1, x8, 0, 1, 2, 3, 0, 1, 2,
@@ -747,10 +745,10 @@ void test_right_1(bool use_constr) {
     make_subview(use_constr, x2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3), 0,
                  1, 2, Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 1, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(1, 0) == &x8(0, 1, 2, 2, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 1, 0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x8(0, 1, 2, 2, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(0, 0), &x8(0, 1, 2, 1, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(1, 0), &x8(0, 1, 2, 2, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(0, 1), &x8(0, 1, 2, 1, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x8(0, 1, 2, 2, 0, 1, 2, 2));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_deg2;
     make_subview(use_constr, x2_deg2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3),
@@ -762,10 +760,10 @@ void test_right_1(bool use_constr) {
     make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3,
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
-    ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg;
     make_subview(use_constr, sx2_deg, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3,
@@ -803,8 +801,8 @@ template <class Space>
 void test_right_3() {
   using view_type = Kokkos::View<int**, Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type xm("x4", 10, 5);
 
     ASSERT_TRUE(xm.span_is_contiguous());
@@ -813,14 +811,14 @@ void test_right_3() {
         Kokkos::subview(xm, 5, 3);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &xm(5, 3));
+    ASSERT_EQ(&x0(), &xm(5, 3));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1 =
         Kokkos::subview(xm, 3, Kokkos::ALL);
 
     ASSERT_TRUE(x1.span_is_contiguous());
     for (int i = 0; i < int(xm.extent(1)); ++i) {
-      ASSERT_TRUE(&x1(i) == &xm(3, i));
+      ASSERT_EQ(&x1(i), &xm(3, i));
     }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2c =
@@ -829,7 +827,7 @@ void test_right_3() {
     ASSERT_TRUE(x2c.span_is_contiguous());
     for (int j = 0; j < int(x2c.extent(1)); ++j)
       for (int i = 0; i < int(x2c.extent(0)); ++i) {
-        ASSERT_TRUE(&x2c(i, j) == &xm(1 + i, j));
+        ASSERT_EQ(&x2c(i, j), &xm(1 + i, j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2 =
@@ -838,20 +836,20 @@ void test_right_3() {
     ASSERT_TRUE(!x2.span_is_contiguous());
     for (int j = 0; j < int(x2.extent(1)); ++j)
       for (int i = 0; i < int(x2.extent(0)); ++i) {
-        ASSERT_TRUE(&x2(i, j) == &xm(i, 2 + j));
+        ASSERT_EQ(&x2(i, j), &xm(i, 2 + j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n1 =
         Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL);
 
-    ASSERT_TRUE(x2_n1.extent(0) == 0);
-    ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1));
+    ASSERT_EQ(x2_n1.extent(0), 0);
+    ASSERT_EQ(x2_n1.extent(1), xm.extent(1));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n2 =
         Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1));
 
-    ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0));
-    ASSERT_TRUE(x2_n2.extent(1) == 0);
+    ASSERT_EQ(x2_n2.extent(0), xm.extent(0));
+    ASSERT_EQ(x2_n2.extent(1), 0);
   }
 }
 
@@ -979,7 +977,7 @@ struct CheckSubviewCorrectness_1D_1D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_1D_1D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1005,7 +1003,7 @@ struct CheckSubviewCorrectness_1D_2D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_1D_2D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1033,7 +1031,7 @@ struct CheckSubviewCorrectness_2D_3D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_2D_3D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1068,7 +1066,7 @@ struct CheckSubviewCorrectness_3D_3D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_3D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1107,7 +1105,7 @@ struct CheckSubviewCorrectness_3D_4D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_4D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1165,7 +1163,7 @@ struct CheckSubviewCorrectness_3D_5D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_5D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/packages/kokkos/core/unit_test/TestView_64bit.hpp b/packages/kokkos/core/unit_test/TestView_64bit.hpp
index 50626718b5774ddefa03a453402564986e831ed1..174a07ac1d5b6ca6e9ddc145617ea86bf51de314 100644
--- a/packages/kokkos/core/unit_test/TestView_64bit.hpp
+++ b/packages/kokkos/core/unit_test/TestView_64bit.hpp
@@ -49,9 +49,9 @@ namespace Test {
 template <class Device>
 void test_64bit() {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-  // FIXME_SYCL The SYCL CUDA backend throws an error
+  // Reduce N to avoid running out of device memory on Intel GPUs.
 #ifdef KOKKOS_ENABLE_SYCL
-  int64_t N = 1000000000;
+  int64_t N = 4000000000;
 #else
   int64_t N = 5000000000;
 #endif
@@ -60,7 +60,7 @@ void test_64bit() {
     Kokkos::parallel_reduce(
         Kokkos::RangePolicy<typename Device::execution_space,
                             Kokkos::IndexType<int64_t>>(0, N),
-        KOKKOS_LAMBDA(const int64_t& /*i*/, int64_t& lsum) { lsum += 1; }, sum);
+        KOKKOS_LAMBDA(const int64_t&, int64_t& lsum) { lsum += 1; }, sum);
     ASSERT_EQ(N, sum);
   }
   {
@@ -111,7 +111,12 @@ void test_64bit() {
     ASSERT_EQ(N0 * N1, sum);
   }
   {
-    int N0    = 1024 * 1024 * 1500;
+// Reduce N0 to avoid running out of device memory on Intel GPUs.
+#ifdef KOKKOS_ENABLE_SYCL
+    int64_t N0 = 1024 * 1024 * 900;
+#else
+    int N0 = 1024 * 1024 * 1500;
+#endif
     int64_t P = 1713091;
     Kokkos::View<int*, Device> a("A", N0);
     Kokkos::parallel_for(
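
The extents in this test are chosen so element counts exceed what a 32-bit index can address. A minimal sketch of the 64-bit dispatch pattern the test exercises, assuming the default execution space; count_to is an illustrative helper:

#include <Kokkos_Core.hpp>
#include <cstdint>

int64_t count_to(int64_t n) {
  int64_t sum = 0;
  // Force the policy's index type to int64_t so n may exceed 2^31 - 1.
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<Kokkos::IndexType<int64_t>>(0, n),
      KOKKOS_LAMBDA(const int64_t, int64_t& lsum) { lsum += 1; }, sum);
  return sum;  // equals n
}
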
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
similarity index 89%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
rename to packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
index 316a2b5d0fe0dba2c9b74f3f6f7a6d61342d2c4c..0287829fd61e88d19449e7e82d0b9727a5413fb3 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
+++ b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
@@ -42,5 +42,12 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_a.hpp>
+#ifndef KOKKOS_TEST_SYCL_HOST_USM_SPACE_HPP
+#define KOKKOS_TEST_SYCL_HOST_USM_SPACE_HPP
+
+#include <gtest/gtest.h>
+
+#define TEST_CATEGORY sycl_host_usm
+#define TEST_EXECSPACE Kokkos::Experimental::SYCLHostUSMSpace
+
+#endif  // KOKKOS_TEST_SYCL_HOST_USM_SPACE_HPP
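
This new category header follows the existing convention: a thin per-backend translation unit defines TEST_CATEGORY and TEST_EXECSPACE, then includes a generic test header, mirroring the CUDA files deleted below. A sketch of such a translation unit (the exact file name would be chosen by the build system):

#include <TestSYCLHostUSM_Category.hpp>
#include <TestViewAPI_a.hpp>
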
diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
deleted file mode 100644
index bab29610a3d4ad2e812405ba96ed06c7e2dfb3b8..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
deleted file mode 100644
index fd227186d5668239b9d9fe3f6a1ae2b3d5510b32..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
deleted file mode 100644
index 669761df979cfd1458f1d5ea78acfb5738af0d38..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
deleted file mode 100644
index d367fd7e051f49495ce747f6f490bad795f94d86..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
deleted file mode 100644
index 01b284b2f562299b4f23cc197693c2baad40f38e..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
deleted file mode 100644
index e15228b1d772a5dba97ee434e17fdb18188a709a..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
deleted file mode 100644
index 52bbd42f292f4b865def36856913dfc6bbe0028f..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
deleted file mode 100644
index 4aeac8f13f4d28672c671a51c1eacfedbf0e92fd..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
deleted file mode 100644
index e5cb0103424fd022290998307f086aedaea0cb29..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
deleted file mode 100644
index a52fcb833ed2a0e959a25e36195460c1ed914a78..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_c.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
deleted file mode 100644
index e345cd9667526671ef898a0d1247343b47f6296c..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
deleted file mode 100644
index 61547df4f523969f8c93da8315fddb4467e5ade9..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
deleted file mode 100644
index 75a769bb947485e6e7459c1cb95b7b3b1c26f9b1..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
deleted file mode 100644
index 7d09f5c9f397b3723599aec64c3c50a6aa77a769..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
deleted file mode 100644
index ea03f43bd69a318095e6277f4db226241fc9a482..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
deleted file mode 100644
index 1f754e8f4996cbc3c0fbefd7000bff65451b19f0..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
deleted file mode 100644
index 4af7057d2aa47db99a8325159e0ee737feff7767..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
index ee7181e1180fdb887a87190605565e42e897409c..d09d4edfdad12e7db332c279398247bfda9ca80a 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
@@ -60,7 +60,7 @@ __global__ void offset(int* p) {
 // Cuda.
 TEST(cuda, raw_cuda_interop) {
   int* p;
-  CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
 
@@ -70,11 +70,11 @@ TEST(cuda, raw_cuda_interop) {
   Kokkos::finalize();
 
   offset<<<100, 64>>>(p);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
 
   std::array<int, 100> h_p;
   cudaMemcpy(h_p.data(), p, sizeof(int) * 100, cudaMemcpyDefault);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
@@ -83,6 +83,6 @@ TEST(cuda, raw_cuda_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
-  CUDA_SAFE_CALL(cudaFree(p));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p));
 }
 }  // namespace Test
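Note on the hunks above: the rename from `CUDA_SAFE_CALL` to `KOKKOS_IMPL_CUDA_SAFE_CALL` marks the macro as Kokkos-internal rather than a name that could collide with user code. For readers unfamiliar with the idiom, a "safe call" macro of this kind typically wraps a CUDA runtime call and aborts with a diagnostic on failure. The sketch below is a hypothetical stand-in to illustrate the pattern, not the actual Kokkos implementation (which lives in the CUDA backend headers):

```c++
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical illustration of the error-checking pattern behind a
// "safe call" macro; the name and message format are stand-ins.
#define MY_CUDA_SAFE_CALL(call)                                      \
  do {                                                               \
    cudaError_t my_err = (call);                                     \
    if (my_err != cudaSuccess) {                                     \
      std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                   cudaGetErrorString(my_err), __FILE__, __LINE__);  \
      std::abort();                                                  \
    }                                                                \
  } while (0)
```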
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
index 526b985c00f2eec2eab6cafb8e862eff5024d575..13388b4c5472c5441d33e9fbfb8f99a995bdcdf0 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
@@ -99,12 +99,12 @@ TEST(cuda, raw_cuda_streams) {
   }
   Kokkos::finalize();
   offset_streams<<<100, 64, 0, stream>>>(p);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   cudaStreamDestroy(stream);
 
   int h_p[100];
   cudaMemcpy(h_p, p, sizeof(int) * 100, cudaMemcpyDefault);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
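The same rename applies in the streams test above. Since this test's point is handing a user-created stream to Kokkos, here is a compact sketch of that interop pattern, assuming the `Kokkos::Cuda(cudaStream_t)` constructor these tests exercise:

```c++
#include <Kokkos_Core.hpp>
#include <cuda_runtime.h>

int main(int argc, char* argv[]) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);  // user-owned stream, created before Kokkos
  Kokkos::initialize(argc, argv);
  {
    Kokkos::Cuda exec(stream);  // execution-space instance on that stream
    Kokkos::View<int*, Kokkos::CudaSpace> v("v", 100);
    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(exec, 0, 100),
                         KOKKOS_LAMBDA(int i) { v(i) = 2 * i; });
    exec.fence();  // synchronizes only this instance's stream
  }
  Kokkos::finalize();
  cudaStreamDestroy(stream);  // the stream outlives Kokkos, as in the test
  return 0;
}
```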
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
index 646b37908654d2af6327158cb49f7d4257e8f8bf..2fa61d43120d338bac3c475fc7cf35e9aeb06776 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -181,37 +181,33 @@ TEST(cuda, space_access) {
   //--------------------------------------
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                        Kokkos::HostSpace>::accessible,
+      !Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::HostSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                                 Kokkos::CudaSpace>::accessible,
-                "");
-
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                       Kokkos::CudaUVMSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::CudaSpace>::accessible,
       "");
 
-  static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                       Kokkos::CudaHostPinnedSpace>::accessible,
-      "");
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::Cuda,
+                                           Kokkos::CudaUVMSpace>::accessible,
+                "");
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                        Kokkos::CudaSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::Cuda,
+                                 Kokkos::CudaHostPinnedSpace>::accessible,
       "");
 
-  static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                       Kokkos::CudaUVMSpace>::accessible,
-      "");
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                            Kokkos::CudaSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                           Kokkos::CudaUVMSpace>::accessible,
+                "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                       Kokkos::CudaHostPinnedSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 Kokkos::CudaHostPinnedSpace>::accessible,
       "");
 
   static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
@@ -235,23 +231,23 @@ TEST(cuda, space_access) {
                                             Kokkos::CudaUVMSpace>>::value,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
-                    Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space,
-                    Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space,
+                                 Kokkos::HostSpace>::accessible,
+      "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<
+      Kokkos::SpaceAccessibility<
           Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space,
           Kokkos::HostSpace>::accessible,
       "");
@@ -265,8 +261,8 @@ TEST(cuda, space_access) {
 
 TEST(cuda, uvm) {
   if (Kokkos::CudaUVMSpace::available()) {
-    int *uvm_ptr = (int *)Kokkos::kokkos_malloc<Kokkos::CudaUVMSpace>(
-        "uvm_ptr", sizeof(int));
+    int *uvm_ptr = static_cast<int *>(
+        Kokkos::kokkos_malloc<Kokkos::CudaUVMSpace>("uvm_ptr", sizeof(int)));
 
     *uvm_ptr = 42;
 
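Two changes run through this file: `SpaceAccessibility` moves from `Kokkos::Impl` to the public `Kokkos` namespace, and the UVM test drops a C-style cast in favor of `static_cast`. With the trait public, downstream generic code can guard memory access at compile time without reaching into `Impl`; a minimal sketch using only the public trait:

```c++
#include <Kokkos_Core.hpp>

// Compile-time guard: refuse to instantiate if ExecSpace cannot reach the
// View's memory space, using the now-public Kokkos::SpaceAccessibility.
template <class ExecSpace, class ViewType>
void fill_iota(const ViewType& v) {
  static_assert(
      Kokkos::SpaceAccessibility<ExecSpace,
                                 typename ViewType::memory_space>::accessible,
      "ExecSpace cannot access this View's memory space");
  Kokkos::parallel_for(
      "fill_iota", Kokkos::RangePolicy<ExecSpace>(0, v.extent(0)),
      KOKKOS_LAMBDA(int i) { v(i) = i; });
}
```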
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
index 5dcbe566e299c0f013843216b0854dc51582dd6d..6d6ff0a67bc151421556fca487f30677a5119c33 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
@@ -59,17 +59,17 @@ TEST(TEST_CATEGORY, host_space_access) {
   using mirror_space =
       Kokkos::Impl::HostMirror<Kokkos::DefaultExecutionSpace>::Space;
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<host_exec_space,
-                                                 Kokkos::HostSpace>::accessible,
+  static_assert(Kokkos::SpaceAccessibility<host_exec_space,
+                                           Kokkos::HostSpace>::accessible,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<device_space,
-                                                 Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<device_space, Kokkos::HostSpace>::accessible,
+      "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<mirror_space,
-                                                 Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<mirror_space, Kokkos::HostSpace>::accessible,
+      "");
 }
 
 }  // namespace Test
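Same trait promotion here, applied to the default-device accessibility checks. The `Impl::HostMirror` alias these asserts probe underpins the public mirror API; a hedged sketch of that public counterpart, for orientation:

```c++
#include <Kokkos_Core.hpp>

// Sketch: create_mirror_view returns a host-accessible view of device data
// (possibly the input view itself when it is already host-accessible).
void read_on_host() {
  Kokkos::View<double*> d("d", 10);        // default execution space memory
  auto h = Kokkos::create_mirror_view(d);  // host mirror; may alias d
  Kokkos::deep_copy(h, d);                 // bring device data to the host
  // h(0) .. h(9) are now safe to read from host code
}
```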
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
index bcd49e69bd3af022ede0ca0a188066288c9b1d35..c74090fff93c8b9a529fad2e5d156d4cad55b954 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
@@ -54,12 +54,13 @@
 namespace Test {
 
 TEST(defaultdevicetype, malloc) {
-  int* data = (int*)Kokkos::kokkos_malloc(100 * sizeof(int));
-  ASSERT_NO_THROW(data = (int*)Kokkos::kokkos_realloc(data, 120 * sizeof(int)));
+  int* data = static_cast<int*>(Kokkos::kokkos_malloc(100 * sizeof(int)));
+  ASSERT_NO_THROW(data = static_cast<int*>(
+                      Kokkos::kokkos_realloc(data, 120 * sizeof(int))));
   Kokkos::kokkos_free(data);
 
-  int* data2 = (int*)Kokkos::kokkos_malloc(0);
-  ASSERT_TRUE(data2 == nullptr);
+  int* data2 = static_cast<int*>(Kokkos::kokkos_malloc(0));
+  ASSERT_EQ(data2, nullptr);
   Kokkos::kokkos_free(data2);
 }
 
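Two idioms adopted above are worth spelling out: `static_cast` replaces C-style casts (the `kokkos_malloc`/`kokkos_realloc` family returns `void*`, so some cast is unavoidable), and `ASSERT_EQ(data2, nullptr)` replaces `ASSERT_TRUE(data2 == nullptr)` because gtest then reports both operands on failure instead of a bare "false". A self-contained sketch of both idioms, with a hypothetical test name:

```c++
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>

TEST(example, malloc_idioms) {
  // kokkos_malloc returns void*, so an explicit cast is required;
  // static_cast is the form clang-tidy accepts.
  int* p = static_cast<int*>(Kokkos::kokkos_malloc(16 * sizeof(int)));
  ASSERT_NE(p, nullptr);  // on failure, gtest prints both operands
  Kokkos::kokkos_free(p);
}
```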
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
deleted file mode 100644
index 02157836b3f6075c6c18e2919d93ed4b541dbab8..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
deleted file mode 100644
index 80e2fe3f93716c23979ede23aa81de9b2f694c9e..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
deleted file mode 100644
index 9694e33ca0ce0f5c2fc6214613f4ae2f03c9750d..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_c.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
deleted file mode 100644
index 0d773494ac6236ce0274cc844fb3369aec81d51d..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
deleted file mode 100644
index cbbbc810b0e8e588be2892b83279a4137675de66..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
deleted file mode 100644
index 444a3e6e95d2a62c1ad0e8bedba3767503dd4687..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
deleted file mode 100644
index f1f90e7acf13c7aaa4820f5bd50ecc403f2d6f5f..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
deleted file mode 100644
index 5e83121e341db1da440c65cd5dce84dc1a6f6259..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
deleted file mode 100644
index c024143d6c7b735dfa3b897e0a4503ee50e4caec..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
deleted file mode 100644
index dcd6c1dc435982fdf44950c3b606847c29c30b37..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
index 0a243e0e8e89c0ef5a7cec6195837909d092bc2a..854f916ba3dad7777f453694ea708a0754872d3d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
@@ -66,8 +66,8 @@ struct TestAsyncLauncher {
 
 TEST(hip, async_launcher) {
   size_t *flag;
-  HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t)));
-  HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t)));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t)));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t)));
   // launch # of cycles * 1000 kernels w/ distinct values
   auto space        = Kokkos::Experimental::HIP();
   auto instance     = space.impl_internal_space_instance();
@@ -80,10 +80,10 @@ TEST(hip, async_launcher) {
   // the sum below should fail
   instance->fence();
   size_t h_flag;
-  HIP_SAFE_CALL(
+  KOKKOS_IMPL_HIP_SAFE_CALL(
       hipMemcpy(&h_flag, flag, sizeof(size_t), hipMemcpyDeviceToHost));
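+  // The expected value is the arithmetic series 0 + 1 + ... + (nkernels - 1).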
   ASSERT_EQ(h_flag, (nkernels * (nkernels - 1)) / 2);
-  HIP_SAFE_CALL(hipFree(flag));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(flag));
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f382e5b568b85cdac1f4943c72aa02be73472d7a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestHIP_Category.hpp>
+
+namespace Test {
+
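+// A functor with a trivial body: launching it should not require any scratch
+// memory for register spills.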
+struct TestNone {
+  Kokkos::View<size_t*, TEST_EXECSPACE> view;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const { view(i) = i; }
+
+  TestNone() { view = Kokkos::View<size_t*, TEST_EXECSPACE>("dummy", 1); }
+};
+
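+// A functor with a large per-thread array, intended to exceed the register
+// budget so that the launch has to reserve scratch memory for spills.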
+struct TestSpiller {
+  Kokkos::View<size_t*, TEST_EXECSPACE> view;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    size_t array[1000] = {0};
+    // fill a large local array and fold it into the result to force spills
+    size_t value = 0;
+    for (int ii = i; ii < 1000; ++ii) {
+      array[ii] = value;
+      value += ii;
+    }
+    for (int ii = i; ii < 1000; ++ii) {
+      value *= array[ii];
+    }
+    Kokkos::atomic_add(&view[0], value);
+  }
+
+  TestSpiller() { view = Kokkos::View<size_t*, TEST_EXECSPACE>("dummy", 1); }
+};
+
+TEST(hip, preferred_blocksize_deduction) {
+  using execution_space =
+      typename Kokkos::Impl::FunctorPolicyExecutionSpace<TestSpiller,
+                                                         void>::execution_space;
+  using policy = Kokkos::RangePolicy<execution_space>;
+
+  {
+    using DriverType = Kokkos::Impl::ParallelFor<TestNone, policy>;
+    ASSERT_EQ(Kokkos::Experimental::Impl::HIPParallelLaunch<
+                  DriverType>::get_scratch_size(),
+              0);
+  }
+
+  {
+    using DriverType = Kokkos::Impl::ParallelFor<TestSpiller, policy>;
+    ASSERT_GT(Kokkos::Experimental::Impl::HIPParallelLaunch<
+                  DriverType>::get_scratch_size(),
+              0);
+  }
+}
+
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
index 3a76ca148cf683a83b84d351e4ebd8b2f7cdec94..73d08abca9d396464e8ba538e6e228c4ad70628b 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
@@ -60,7 +60,7 @@ __global__ void offset(int* p) {
 // HIP.
 TEST(hip, raw_hip_interop) {
   int* p;
-  HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
 
@@ -70,11 +70,12 @@ TEST(hip, raw_hip_interop) {
   Kokkos::finalize();
 
   offset<<<dim3(100), dim3(100), 0, nullptr>>>(p);
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
 
   std::array<int, 100> h_p;
-  HIP_SAFE_CALL(hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault));
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
@@ -83,6 +84,6 @@ TEST(hip, raw_hip_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
-  HIP_SAFE_CALL(hipFree(p));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(p));
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
index 8e0880ddbd0b15524be75ab97b90044e5315a8ff..69ca62df6a3a3e95cc77fb4354b96eb6a16e0c2d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
@@ -51,11 +51,11 @@ namespace Test {
 // bound in HIP due to an error when computing the block size.
 TEST(hip, raw_hip_streams) {
   hipStream_t stream;
-  HIP_SAFE_CALL(hipStreamCreate(&stream));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
   int* p;
-  HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
 
   {
@@ -97,12 +97,13 @@ TEST(hip, raw_hip_streams) {
   }
   Kokkos::finalize();
   offset_streams<<<100, 64, 0, stream>>>(p);
-  HIP_SAFE_CALL(hipDeviceSynchronize());
-  HIP_SAFE_CALL(hipStreamDestroy(stream));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream));
 
   int h_p[100];
-  HIP_SAFE_CALL(hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault));
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
index ae1de8ea2d304e41d672ff2e136d16c86cbb8068..d20ea877ec9e1f4aee9f0df5c1d807790cdc932e 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
@@ -129,27 +129,26 @@ TEST(hip, space_access) {
 
   //--------------------------------------
 
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::HIP,
+                                            Kokkos::HostSpace>::accessible,
+                "");
+
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::Experimental::HIP,
-                                        Kokkos::HostSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::Experimental::HIP,
+                                 Kokkos::Experimental::HIPSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
-                    Kokkos::Experimental::HIP,
-                    Kokkos::Experimental::HIPSpace>::accessible,
-                "");
-
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Experimental::HIP,
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>::accessible,
+      !Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                  Kokkos::Experimental::HIPSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::HostSpace,
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
@@ -166,18 +165,18 @@ TEST(hip, space_access) {
                    Kokkos::Experimental::HIPHostPinnedSpace>::value,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::Experimental::HIP>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<
+      Kokkos::SpaceAccessibility<
           Kokkos::Impl::HostMirror<Kokkos::Experimental::HIPSpace>::Space,
           Kokkos::HostSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<
                         Kokkos::Experimental::HIPHostPinnedSpace>::Space,
                     Kokkos::HostSpace>::accessible,
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
index db360a99d3d60977cf06479e7662e21350dd5f99..86b2fab3c7e3fc0b53b309646add9f4817b804f2 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
@@ -104,7 +104,7 @@ void hip_stream_scratch_test(
   hipStream_t stream[4];
   Kokkos::Experimental::HIP hip[4];
   for (int i = 0; i < K; i++) {
-    HIP_SAFE_CALL(hipStreamCreate(&stream[i]));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream[i]));
     hip[i] = Kokkos::Experimental::HIP(stream[i]);
   }
 // Test that growing scratch size in subsequent calls doesn't crash things
@@ -131,7 +131,7 @@ void hip_stream_scratch_test(
   Kokkos::fence();
   for (int i = 0; i < K; i++) {
     hip[i] = Kokkos::Experimental::HIP();
-    HIP_SAFE_CALL(hipStreamDestroy(stream[i]));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream[i]));
   }
 }
 }  // namespace Impl
diff --git a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
index 419486d7a84673dacd48e7bf2513e054106bab4c..4d5ca46ba6ee6a41c8a9461bc5c26b36e5996a55 100644
--- a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
@@ -88,7 +88,7 @@ struct TestIncrExecSpace {
     ExecSpace().fence();
 
     auto concurrency = ExecSpace().concurrency();
-    ASSERT_TRUE(concurrency > 0);
+    ASSERT_GT(concurrency, 0);
 
     int in_parallel = ExecSpace::in_parallel();
     ASSERT_FALSE(in_parallel);
@@ -107,5 +107,7 @@ TEST(TEST_CATEGORY, IncrTest_01_execspace) {
   ASSERT_TRUE(Kokkos::is_execution_space<TEST_EXECSPACE>::value);
   ASSERT_FALSE(Kokkos::is_execution_space<
                TestIncrExecSpaceTypedef<TEST_EXECSPACE>>::value);
+  TestIncrExecSpace<TEST_EXECSPACE> test;
+  test.testit();
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp b/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
index ff4fb6a89f4d380d0693e8697e27fbf5bde2f4d0..d40cb4dbe7f77627429f9880ce4981b38f414c9d 100644
--- a/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
@@ -78,7 +78,7 @@ struct TestIncrAtomic {
   }
 };
 
-TEST(TEST_CATEGORY, IncrTest_01_AtomicExchange) {
+TEST(TEST_CATEGORY, IncrTest_02_AtomicExchange) {
   TestIncrAtomic test;
   test.testExchange();
 }
diff --git a/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
index 4adf9e058fd5b1a85b3f7e24cac530876b7251f3..4192d4abe865f10a43e9a87ed6ee4aa877974dc0 100644
--- a/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
@@ -94,16 +94,16 @@ struct TestMDRangePolicy {
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   // An MDRangePolicy for 3 nested loops
-  using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<3>, int_index>;
+  using MDPolicyType_3D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, int_index>;
 
   // An MDRangePolicy for 4 nested loops
-  using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<4>, int_index>;
+  using MDPolicyType_4D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, int_index>;
 
   // Device and Host Data structure pointer
   value_type *deviceData, *hostData;
diff --git a/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp b/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
index 5166f5a9f0de05b24166161654c9eaab4ff2ad82..6e8fc07b8de50e61e1139ad0b7ba6e2752e81229 100644
--- a/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
@@ -61,17 +61,17 @@ const int M      = 10;
 template <class ExecSpace>
 struct TestMDRangePolicy {
   // 2D View
-  using View_2D      = typename Kokkos::View<value_type **, ExecSpace>;
+  using View_2D      = Kokkos::View<value_type **, ExecSpace>;
   using Host_View_2D = typename View_2D::HostMirror;
   Host_View_2D hostDataView_2D;
 
   // 3D View
-  using View_3D      = typename Kokkos::View<value_type ***, ExecSpace>;
+  using View_3D      = Kokkos::View<value_type ***, ExecSpace>;
   using Host_View_3D = typename View_3D::HostMirror;
   Host_View_3D hostDataView_3D;
 
   // 4D View
-  using View_4D      = typename Kokkos::View<value_type ****, ExecSpace>;
+  using View_4D      = Kokkos::View<value_type ****, ExecSpace>;
   using Host_View_4D = typename View_4D::HostMirror;
   Host_View_4D hostDataView_4D;
 
@@ -83,16 +83,16 @@ struct TestMDRangePolicy {
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   // An MDRangePolicy for 3 nested loops
-  using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<3>, int_index>;
+  using MDPolicyType_3D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, int_index>;
 
   // An MDRangePolicy for 4 nested loops
-  using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<4>, int_index>;
+  using MDPolicyType_4D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, int_index>;
 
   // compare and equal
   void compare_equal_2D() {
diff --git a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
index 5bf1860d8e4a6bcf739656bdc7e1f790ebf60512..ab1cd90d4bfa2e5f5800b8dd13f23a999d2a8fa0 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
@@ -74,9 +74,15 @@ struct ThreadScratch {
     for (int i = 0; i < sY; ++i) v_S(i) = 0;
 
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, sX), [&](const int m) {
+    // FIXME_SYCL This deadlocks in the subgroup_barrier when running on CUDA
+    // devices.
+#ifdef KOKKOS_ENABLE_SYCL
+      for (int k = 0; k < sY; ++k) v_S(k) += sX * sY * n + sY * m + k;
+#else
       Kokkos::parallel_for(
           Kokkos::ThreadVectorRange(team, sY),
           [&](const int k) { v_S(k) += sX * sY * n + sY * m + k; });
+#endif
     });
 
     team.team_barrier();
@@ -93,7 +99,7 @@ struct ThreadScratch {
     int scratchSize = scratch_t::shmem_size(sY);
     // So this works with deprecated code enabled:
     policy_t policy =
-        policy_t(pN, Kokkos::AUTO)
+        policy_t(pN, Kokkos::AUTO, 1)
             .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize));
 
     int max_team_size = policy.team_size_max(*this, Kokkos::ParallelForTag());
diff --git a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
index b34f652e76d919f14c3afed0656b8bcd86dbc27f..d81822d0da9cfd67b25c8394886509407ecbfeb0 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
@@ -68,7 +68,7 @@ struct TeamScratch {
 
     Kokkos::parallel_for(
         "Team",
-        policy_t(pN, Kokkos::AUTO)
+        policy_t(pN, Kokkos::AUTO, 1)
             .set_scratch_size(scratch_level, Kokkos::PerTeam(scratchSize)),
         KOKKOS_LAMBDA(const team_t &team) {
           // Allocate and use scratch pad memory
@@ -77,11 +77,20 @@ struct TeamScratch {
 
           Kokkos::parallel_for(
               Kokkos::TeamThreadRange(team, sX), [&](const int m) {
+      // FIXME_SYCL This deadlocks in the subgroup_barrier
+      // when running on CUDA devices.
+#ifdef KOKKOS_ENABLE_SYCL
+                for (int k = 0; k < sY; ++k) {
+                  v_S(m, k) =
+                      v_S.extent(0) * v_S.extent(1) * n + v_S.extent(1) * m + k;
+                }
+#else
                 Kokkos::parallel_for(
                     Kokkos::ThreadVectorRange(team, sY), [&](const int k) {
                       v_S(m, k) = v_S.extent(0) * v_S.extent(1) * n +
                                   v_S.extent(1) * m + k;
                     });
+#endif
               });
 
           team.team_barrier();
diff --git a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
index d227e834dc64607c4ca01127228527dc71e9e918..7d53b9fb208ea490626634c304dd130b74392461 100644
--- a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
@@ -82,20 +82,20 @@ struct MyComplex {
 template <class ExecSpace>
 struct TestMDRangeReduce {
   // 1D  View of double
-  using View_1D = typename Kokkos::View<value_type*, ExecSpace>;
+  using View_1D = Kokkos::View<value_type*, ExecSpace>;
 
   // 2D  View of double
-  using View_2D = typename Kokkos::View<value_type**, ExecSpace>;
+  using View_2D = Kokkos::View<value_type**, ExecSpace>;
 
   // Index Type for the iterator
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   //  1D - complex View
-  using Complex_View_1D = typename Kokkos::View<MyComplex*, ExecSpace>;
+  using Complex_View_1D = Kokkos::View<MyComplex*, ExecSpace>;
 
   // Reduction when ExecPolicy = MDRangePolicy and ReducerArgument =
   // scalar/1-element view
@@ -176,7 +176,11 @@ struct TestMDRangeReduce {
 TEST(TEST_CATEGORY, incr_14_MDrangeReduce) {
   TestMDRangeReduce<TEST_EXECSPACE> test;
   test.reduce_MDRange();
+// FIXME_OPENMPTARGET: custom reductions are not yet supported in the
+// OpenMPTarget backend.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
   test.reduce_custom();
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
index 018855963d35f8fef81a93985811dcc3d9b239fc..d145d69d9e0feb4450bfff5080e9955115b5c49e 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
@@ -52,13 +52,16 @@ namespace Test {
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
 // SYCL.
 TEST(sycl, raw_sycl_interop) {
+  Kokkos::InitArguments arguments{-1, -1, -1, false};
+  Kokkos::initialize(arguments);
+
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
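+  // Create the queue in Kokkos' default SYCL context: USM allocations are
+  // tied to a context, so this keeps the raw allocation usable from Kokkos.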
   sycl::default_selector device_selector;
-  sycl::queue queue(device_selector);
+  sycl::queue queue(default_context, device_selector);
   constexpr int n = 100;
   int* p          = sycl::malloc_device<int>(n, queue);
-
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
   {
     TEST_EXECSPACE space(queue);
     Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n);
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91fdaac6e097fb9b127816301ca2e5c514a4f374
--- /dev/null
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp
@@ -0,0 +1,356 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestSYCL_Category.hpp>
+
+namespace Test {
+
+TEST(sycl, space_access) {
+  static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
+                                                Kokkos::HostSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                       Kokkos::HostSpace>::accessible,
+      "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                       Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                      Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::SYCL,
+                                            Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      std::is_same<Kokkos::Impl::HostMirror<
+                       Kokkos::Experimental::SYCLDeviceUSMSpace>::Space,
+                   Kokkos::HostSpace>::value,
+      "");
+
+  static_assert(
+      std::is_same<
+          Kokkos::Impl::HostMirror<
+              Kokkos::Experimental::SYCLSharedUSMSpace>::Space,
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>>::value,
+      "");
+
+  static_assert(
+      Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                      Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(std::is_same<Kokkos::Impl::HostMirror<
+                                 Kokkos::Experimental::SYCLHostUSMSpace>::Space,
+                             Kokkos::Experimental::SYCLHostUSMSpace>::value,
+                "");
+
+  static_assert(
+      std::is_same<
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>,
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>>::value,
+      "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<Kokkos::Experimental::SYCL>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLDeviceUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLSharedUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLHostUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+}
+
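+// Allocate shared USM memory, update it on the device, and verify the result
+// on the host.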
+TEST(sycl, uvm) {
+  int *uvm_ptr = static_cast<int *>(
+      Kokkos::kokkos_malloc<Kokkos::Experimental::SYCLSharedUSMSpace>(
+          "uvm_ptr", sizeof(int)));
+
+  *uvm_ptr = 42;
+
+  Kokkos::Experimental::SYCL().fence();
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<Kokkos::Experimental::SYCL>(0, 1),
+      KOKKOS_LAMBDA(int) {
+        if (*uvm_ptr == 42) {
+          *uvm_ptr = 2 * 42;
+        }
+      });
+  Kokkos::Experimental::SYCL().fence();
+
+  EXPECT_EQ(*uvm_ptr, int(2 * 42));
+
+  Kokkos::kokkos_free<Kokkos::Experimental::SYCLSharedUSMSpace>(uvm_ptr);
+}
+
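+// Fill a view in the execution space associated with its memory space, then
+// verify the contents from a (possibly different) execution space.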
+template <class MemSpace, class ExecSpace>
+struct TestViewSYCLAccessible {
+  enum { N = 1000 };
+
+  using V = Kokkos::View<double *, MemSpace>;
+
+  V m_base;
+
+  struct TagInit {};
+  struct TagTest {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagInit &, const int i) const { m_base[i] = i + 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagTest &, const int i, long &error_count) const {
+    if (m_base[i] != i + 1) ++error_count;
+  }
+
+  TestViewSYCLAccessible() : m_base("base", N) {}
+
+  static void run() {
+    TestViewSYCLAccessible self;
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<typename MemSpace::execution_space, TagInit>(0, N),
+        self);
+    typename MemSpace::execution_space().fence();
+
+    // Next access is a different execution space, must complete prior kernel.
+    long error_count = -1;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, TagTest>(0, N), self,
+                            error_count);
+    EXPECT_EQ(error_count, 0);
+  }
+};
+
+TEST(sycl, impl_view_accessible) {
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::HostSpace::execution_space>::run();
+
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::HostSpace::execution_space>::run();
+}
+
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
similarity index 96%
rename from packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
rename to packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
index 6602d7396a7c2fdec7e16e83079764962dbeab75..95a7b68088c1238672b2257d285d7329d52cbec7 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
@@ -42,5 +42,5 @@
 //@HEADER
 */
 
-#include <TestCudaUVM_Category.hpp>
-#include <TestSharedAlloc.hpp>
+#include <TestSYCL_Category.hpp>
+#include <TestTaskScheduler.hpp>
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab0d09880f03b56d81ae693d26b5c838b2436a24
--- /dev/null
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
@@ -0,0 +1,154 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <TestSYCL_Category.hpp>
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace Impl {
+
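+// Each team zero-fills a scratch view, increments every entry N times, and
+// bumps the global atomic counter for every entry that does not end up
+// equal to N.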
+struct SYCLQueueScratchTestFunctor {
+  using team_t = Kokkos::TeamPolicy<Kokkos::Experimental::SYCL>::member_type;
+  using scratch_t =
+      Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>;
+
+  Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace,
+               Kokkos::MemoryTraits<Kokkos::Atomic>>
+      counter;
+  int N, M;
+  SYCLQueueScratchTestFunctor(
+      Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter_,
+      int N_, int M_)
+      : counter(counter_), N(N_), M(M_) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const team_t& team) const {
+    scratch_t scr(team.team_scratch(1), M);
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M),
+                         [&](int i) { scr[i] = 0; });
+    team.team_barrier();
+    for (int i = 0; i < N; i++) {
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M),
+                           [&](int j) { scr[j] += 1; });
+    }
+    team.team_barrier();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), [&](int i) {
+      if (scr[i] != N) counter()++;
+    });
+  }
+};
+
+void sycl_queue_scratch_test_one(
+    int N, int T, int M_base,
+    Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter,
+    Kokkos::Experimental::SYCL sycl, int tid) {
+  int M = M_base + tid * 5;
+  Kokkos::TeamPolicy<Kokkos::Experimental::SYCL> p(sycl, T, 64);
+  using scratch_t =
+      Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>;
+
+  int bytes = scratch_t::shmem_size(M);
+
+  for (int r = 0; r < 15; r++) {
+    Kokkos::parallel_for("Run", p.set_scratch_size(1, Kokkos::PerTeam(bytes)),
+                         SYCLQueueScratchTestFunctor(counter, N, M));
+  }
+}
+
+void sycl_queue_scratch_test(
+    int N, int T, int M_base,
+    Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter) {
+  constexpr int K = 4;
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
+  sycl::default_selector device_selector;
+
+  std::array<Kokkos::Experimental::SYCL, K> sycl;
+  for (int i = 0; i < K; i++) {
+    sycl[i] = Kokkos::Experimental::SYCL(
+        sycl::queue(default_context, device_selector));
+  }
+
+  // Test that growing scratch size in subsequent calls doesn't crash things
+#if defined(KOKKOS_ENABLE_OPENMP)
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    // Limit how many threads submit
+    if (tid < 4) {
+      sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+    }
+  }
+#else
+  for (int tid = 0; tid < K; tid++) {
+    sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+  }
+#endif
+  // Test that if everything is large enough, multiple launches with different
+  // scratch sizes don't step on each other
+  for (int tid = K - 1; tid >= 0; tid--) {
+    sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+  }
+
+  Kokkos::fence();
+}
+}  // namespace Impl
+
+TEST(sycl, team_scratch_1_queues) {
+  int N      = 1000000;
+  int T      = 10;
+  int M_base = 150;
+
+  Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter("C");
+
+  Impl::sycl_queue_scratch_test(N, T, M_base, counter);
+
+  int64_t result;
+  Kokkos::deep_copy(result, counter);
+  ASSERT_EQ(0, result);
+}
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp b/packages/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..870621c1e0d3e4530573fc70ee24208b3b5a7911
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp
@@ -0,0 +1,123 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#include <Kokkos_Core.hpp>
+using ExecSpace  = Kokkos::DefaultHostExecutionSpace;
+using TeamMember = Kokkos::TeamPolicy<ExecSpace>::member_type;
+struct TestTeamFunctor {
+  KOKKOS_FUNCTION void operator()(TeamMember) const {}
+};
+struct TestMDFunctor {
+  KOKKOS_FUNCTION void operator()(const int, const int) const {}
+};
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    Kokkos::TeamPolicy<ExecSpace> teamp(1, Kokkos::AUTO, Kokkos::AUTO);
+    Kokkos::MDRangePolicy<Kokkos::Rank<2>> mdp({0, 0}, {1, 1});
+    Kokkos::Tools::Experimental::TeamSizeTuner team_tune_this(
+        "team_tuner", teamp, TestTeamFunctor{}, Kokkos::ParallelForTag{},
+        Kokkos::Tools::Impl::Impl::SimpleTeamSizeCalculator{});
+
+    Kokkos::Tools::Experimental::MDRangeTuner<2> md_tune_this(
+        "md_tuner", mdp, TestMDFunctor{}, Kokkos::ParallelForTag{},
+        Kokkos::Tools::Impl::Impl::SimpleTeamSizeCalculator{});
+
+    std::vector<int> options{1, 2, 3, 4, 5};
+
+    auto new_team_tuner = team_tune_this.combine("options", options);
+    auto new_md_tuner   = md_tune_this.combine("options", options);
+    using namespace Kokkos::Tools::Experimental;
+    VariableInfo info;
+    info.category      = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity = CandidateValueType::kokkos_value_unbounded;
+    info.type          = ValueType::kokkos_value_string;
+    size_t input       = declare_input_type("kernel", info);
+    VariableValue team_kernel_value = make_variable_value(input, "abs");
+    VariableValue md_kernel_value   = make_variable_value(input, "abs");
+    size_t kernel_context           = get_new_context_id();
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &team_kernel_value);
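+    // Drive the MDRange tuner: each iteration requests a configuration (the
+    // categorical option plus tile sizes), runs the kernel with it, and
+    // reports back via end().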
+    for (int x = 0; x < 10000; ++x) {
+      auto config = new_md_tuner.begin();
+      int option  = std::get<0>(config);
+      (void)option;
+      int tile_x = std::get<1>(config);
+      int tile_y = std::get<2>(config);
+      Kokkos::parallel_for("mdrange",
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
+                               {0, 0}, {1, 1}, {tile_x, tile_y}),
+                           TestMDFunctor{});
+      new_md_tuner.end();
+    }
+    end_context(kernel_context);
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &md_kernel_value);
+
+    /**
+     * Note that 0.0 is essentially a floating-point index into
+     * the outermost dimension of the tuner, which is the options
+     * vector above. At 0.0 this yields the first element (1);
+     * at 0.9 it yields the last element (5).
+     */
+    auto begin_point = new_team_tuner.get_point(0.0, 0.0, 0.0);
+    (void)begin_point;  // avoid unused warnings when NDEBUG elides assert
+    assert(std::get<0>(begin_point) == 1);
+    auto end_point = new_team_tuner.get_point(0.9, 0.0, 0.0);
+    (void)end_point;  // avoid unused warnings when NDEBUG elides assert
+    assert(std::get<0>(end_point) == 5);
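+    // With five options this behaves like floor(fraction * size): 0.0 maps
+    // to index 0 (value 1) and 0.9 to index 4 (value 5); the exact rounding
+    // is an implementation detail of the tuner.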
+    for (int x = 0; x < 10000; ++x) {
+      auto config = new_team_tuner.begin();
+      int option  = std::get<0>(config);
+      (void)option;
+      int team   = std::get<1>(config);
+      int vector = std::get<2>(config);
+      Kokkos::parallel_for("mdrange",
+                           Kokkos::TeamPolicy<ExecSpace>(1, team, vector),
+                           TestTeamFunctor{});
+      new_team_tuner.end();
+    }
+    end_context(kernel_context);
+  }
+  Kokkos::finalize();
+}
diff --git a/packages/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp b/packages/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2177556d392f002ad17499ad08fe93268a3c7937
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// This file tests the categorical tuner
+
+#include <Kokkos_Core.hpp>
+#include <unistd.h>
+#include <vector>
+struct point {
+  float x;
+  float y;
+  float z;
+};
+void do_computation(const point& test_point) {
+  usleep(static_cast<unsigned int>(test_point.x) * 100);
+}
+using namespace Kokkos::Tools::Experimental;
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    VariableInfo info;
+    info.category              = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity         = CandidateValueType::kokkos_value_unbounded;
+    info.type                  = ValueType::kokkos_value_string;
+    size_t input               = declare_input_type("kernel", info);
+    VariableValue kernel_value = make_variable_value(input, "abs");
+    size_t kernel_context      = get_new_context_id();
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &kernel_value);
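+    // The context scopes the input value so that tuning decisions made while
+    // it is active can be associated with this "kernel" setting by a tool.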
+
+    std::vector<point> points;
+    points.push_back({1.0, 1.0, 1.0});
+    points.push_back({10.0, 10.0, 10.0});
+    points.push_back({0.0, 0.0, 0.0});
+    auto tuner =
+        Kokkos::Tools::Experimental::make_categorical_tuner("points", points);
+    for (decltype(points)::size_type x = 0; x < 3000; ++x) {
+      point test_point = tuner.begin();
+      do_computation(test_point);
+      tuner.end();
+    }
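+    // A tuning tool driving this loop would be expected to converge toward
+    // the {0, 0, 0} point, since do_computation sleeps in proportion to x.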
+
+    end_context(kernel_context);
+  }
+  Kokkos::finalize();
+}
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
similarity index 94%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
rename to packages/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
index 4228b5181a0ccd68dfde87f71f92fd0a471a8e96..ac0b4d26196351c6654c9b7996931784e4fa2653 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
+++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
@@ -42,5 +42,8 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestSharedAlloc.hpp>
+#include <iostream>
+#include "Kokkos_Core.hpp"
+
+#include <tools/TestEventCorrectness.hpp>
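+// UnitTestMainInit.cpp supplies the gtest main() with Kokkos
+// initialization and finalization: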
+#include "../UnitTestMainInit.cpp"
diff --git a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..430677a335df32737a08520467cd26513f2e83e7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
@@ -0,0 +1,284 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#include <iostream>
+#include <gtest/gtest.h>
+#include "Kokkos_Core.hpp"
+
+#include <impl/Kokkos_Stacktrace.hpp>
+#include <vector>
+#include <algorithm>
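+// Forward declarations so that backend types (e.g. Kokkos::Serial below)
+// can be named even in builds where those backends are disabled.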
+namespace Kokkos {
+class Serial;
+class OpenMP;
+class Cuda;
+class Threads;
+namespace Experimental {
+class SYCL;
+class HIP;
+class OpenMPTarget;
+class HPX;
+}  // namespace Experimental
+}  // namespace Kokkos
+namespace Test {
+struct FencePayload {
+  std::string name;
+  enum distinguishable_devices { yes, no };
+  distinguishable_devices distinguishable;
+  uint32_t dev_id;
+};
+
+std::vector<FencePayload> found_payloads;
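+// Registers begin-fence and begin-parallel_for callbacks, invokes the given
+// lambda, and then checks that every expected payload matches at least one
+// observed event by name substring and exact device id.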
+template <typename Lambda>
+void expect_fence_events(std::vector<FencePayload>& expected, Lambda lam) {
+  found_payloads = {};
+  Kokkos::Tools::Experimental::set_begin_fence_callback(
+      [](const char* name, const uint32_t dev_id, uint64_t*) {
+        found_payloads.push_back(
+            FencePayload{std::string(name),
+                         FencePayload::distinguishable_devices::no, dev_id});
+      });
+  Kokkos::Tools::Experimental::set_begin_parallel_for_callback(
+      [](const char* name, const uint32_t dev_id, uint64_t*) {
+        found_payloads.push_back(
+            FencePayload{std::string(name),
+                         FencePayload::distinguishable_devices::no, dev_id});
+      });
+  lam();
+  for (auto& entry : expected) {
+    std::cout << "Ref: " << entry.dev_id << std::endl;
+    std::cout << "Ref: " << entry.name << std::endl;
+    auto search = std::find_if(
+        found_payloads.begin(), found_payloads.end(),
+        [&](const auto& found_entry) {
+          auto name_match =
+              (found_entry.name.find(entry.name) != std::string::npos);
+          auto id_match = (entry.dev_id == found_entry.dev_id);
+          std::cout << found_entry.dev_id << std::endl;
+          std::cout << found_entry.name << std::endl;
+          if (!name_match) {
+            std::cout << "Miss on name\n";
+          }
+          if (!id_match) {
+            std::cout << "Miss on id\n";
+          }
+          return (name_match && id_match);
+        });
+    auto found = (search != found_payloads.end());
+    ASSERT_TRUE(found);
+  }
+  Kokkos::Tools::Experimental::set_begin_fence_callback(
+      [](const char*, const uint32_t, uint64_t*) {});
+  Kokkos::Tools::Experimental::set_begin_parallel_for_callback(
+      [](const char*, const uint32_t, uint64_t*) {});
+}
+
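+// increment<ExecSpace>::size is a customization point for backends in which
+// each newly created instance receives its own device id; the generic case
+// adds 0, so num_instances only changes where a specialization says so.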
+template <class>
+struct increment {
+  constexpr static int size = 0;
+};
+int num_instances = 1;
+struct TestFunctor {
+  KOKKOS_FUNCTION void operator()(const int) const {}
+};
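+// Run the test body only when the default execution space is not Serial;
+// the device-id expectations below do not apply to the Serial backend.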
+template <typename Lambda>
+void test_wrapper(const Lambda& lambda) {
+  if (!std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value) {
+    lambda();
+  }
+}
+/**
+ * Test that fencing an instance with a name yields a fence
+ * event with that name and the correct device ID.
+ */
+TEST(defaultdevicetype, test_named_instance_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"named_instance", FencePayload::distinguishable_devices::no,
+         root + num_instances}};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      ex.fence("named_instance");
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+/**
+ * Test that fencing an instance without a name yields a fence
+ * event with the correct name and the correct device ID.
+ */
+TEST(defaultdevicetype, test_unnamed_instance_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"Unnamed Instance Fence", FencePayload::distinguishable_devices::no,
+         root + num_instances}};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      ex.fence();
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+
+/**
+ * Test that invoking a global fence with a name yields a fence
+ * event with that name and fences the root of the default device.
+ */
+TEST(defaultdevicetype, test_named_global_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+
+    std::vector<FencePayload> expected{
+
+        {"test global fence", FencePayload::distinguishable_devices::no, root}};
+    expect_fence_events(expected,
+                        [=]() { Kokkos::fence("test global fence"); });
+  });
+}
+
+/**
+ * Test that invoking a global fence with no name yields a fence
+ * event with the correct name and fences the root of the default device.
+ */
+TEST(defaultdevicetype, test_unnamed_global_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+
+    std::vector<FencePayload> expected{
+
+        {"Unnamed Global Fence", FencePayload::distinguishable_devices::no,
+         root}};
+    expect_fence_events(expected, [=]() { Kokkos::fence(); });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+/**
+ * Test that creating two default instances and fencing both yields
+ * fences on the same device ID, since both refer to the same instance.
+ */
+TEST(defaultdevicetype, test_multiple_default_instances) {
+  test_wrapper([&]() {
+    std::vector<FencePayload> expected{};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex1;
+      Kokkos::DefaultExecutionSpace ex2;
+      ex1.fence("named_instance_fence_one");
+      ex2.fence("named_instance_fence_two");
+    });
+    ASSERT_EQ(found_payloads[0].dev_id, found_payloads[1].dev_id);
+  });
+}
+
+/**
+ * Test that fences and kernels yield events on the correct device IDs.
+ */
+TEST(defaultdevicetype, test_kernel_sequence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"named_instance", FencePayload::distinguishable_devices::no,
+         root + num_instances},
+        {"test_kernel", FencePayload::distinguishable_devices::no,
+         root + num_instances}
+
+    };
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      TestFunctor tf;
+      ex.fence("named_instance");
+      Kokkos::parallel_for(
+          "test_kernel",
+          Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(ex, 0, 1), tf);
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+#ifdef KOKKOS_ENABLE_CUDA
+/**
+ * CUDA ONLY: test that creating instances from streams leads to events
+ * on different device IDs.
+ */
+TEST(defaultdevicetype, test_streams) {
+  test_wrapper([&]() {
+    std::vector<FencePayload> expected{};
+    expect_fence_events(expected, [=]() {
+      cudaStream_t s1, s2;
+      cudaStreamCreate(&s1);
+      cudaStreamCreate(&s2);
+      Kokkos::Cuda default_space;
+      Kokkos::Cuda space_s1(s1);
+      Kokkos::Cuda space_s2(s2);
+      default_space.fence();
+      space_s1.fence();
+      space_s2.fence();
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+    found_payloads.erase(
+        std::remove_if(found_payloads.begin(), found_payloads.end(),
+                       [&](const auto& entry) {
+                         return (
+                             entry.name.find("Fence on space initialization") !=
+                             std::string::npos);
+                       }),
+        found_payloads.end());
+    ASSERT_NE(found_payloads[0].dev_id, found_payloads[1].dev_id);
+    ASSERT_NE(found_payloads[2].dev_id, found_payloads[1].dev_id);
+    ASSERT_NE(found_payloads[2].dev_id, found_payloads[0].dev_id);
+  });
+}
+
+#endif
+
+}  // namespace Test
diff --git a/packages/kokkos/example/query_device/query_device.cpp b/packages/kokkos/example/query_device/query_device.cpp
index a563b06b2864d5d0e855a80b836f3ef70f33f3a1..9c4e9a8c835938c0b301fa1927a2cb5d08e654c1 100644
--- a/packages/kokkos/example/query_device/query_device.cpp
+++ b/packages/kokkos/example/query_device/query_device.cpp
@@ -47,7 +47,8 @@
 
 #include <Kokkos_Macros.hpp>
 
-#if defined(KOKKOS_ENABLE_MPI)
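+// Define USE_MPI manually to exercise the MPI code path; this example no
+// longer keys off KOKKOS_ENABLE_MPI.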
+//#define USE_MPI
+#if defined(USE_MPI)
 #include <mpi.h>
 #endif
 
@@ -61,7 +62,7 @@ int main(int argc, char** argv) {
 
   (void)argc;
   (void)argv;
-#if defined(KOKKOS_ENABLE_MPI)
+#if defined(USE_MPI)
 
   MPI_Init(&argc, &argv);
 
@@ -72,7 +73,7 @@ int main(int argc, char** argv) {
   msg << "MPI rank(" << mpi_rank << ") ";
 
 #endif
-
+  Kokkos::initialize(argc, argv);
   msg << "{" << std::endl;
 
   if (Kokkos::hwloc::available()) {
@@ -82,15 +83,13 @@ int main(int argc, char** argv) {
         << std::endl;
   }
 
-#if defined(KOKKOS_ENABLE_CUDA)
-  Kokkos::Cuda::print_configuration(msg);
-#endif
+  Kokkos::print_configuration(msg);
 
   msg << "}" << std::endl;
 
   std::cout << msg.str();
-
-#if defined(KOKKOS_ENABLE_MPI)
+  Kokkos::finalize();
+#if defined(USE_MPI)
 
   MPI_Finalize();
 
diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
index 07b99087d4c310e6cf0d82c026f52bd610dd0ecb..5ac7f4fbb060ae952a0685313ec357ffa05abf96 100644
--- a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
+++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
@@ -107,8 +107,8 @@ int main(int argc, char* argv[]) {
 
   // ViewType aliases for Rank<2>, Rank<3> for example usage
   using ScalarType  = double;
-  using ViewType_2D = typename Kokkos::View<ScalarType**>;
-  using ViewType_3D = typename Kokkos::View<ScalarType***>;
+  using ViewType_2D = Kokkos::View<ScalarType**>;
+  using ViewType_3D = Kokkos::View<ScalarType***>;
 
   /////////////////////////////////////////////////////////////////////////////
   // Explanation of MDRangePolicy usage, template parameters, constructor
@@ -160,8 +160,7 @@ int main(int argc, char* argv[]) {
   long incorrect_count_2d = 0;
   {
     // Rank<2> Case: Rank is provided, all other parameters are default
-    using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-        Kokkos::Experimental::Rank<2> >;
+    using MDPolicyType_2D = Kokkos::MDRangePolicy<Kokkos::Rank<2> >;
 
     // Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims
     // defaulted
@@ -185,9 +184,8 @@ int main(int argc, char* argv[]) {
   long incorrect_count_3d = 0;
   {
     // Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided
-    using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-        Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left,
-                                   Kokkos::Experimental::Iterate::Left> >;
+    using MDPolicyType_3D = Kokkos::MDRangePolicy<
+        Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left> >;
 
     // Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided
     MDPolicyType_3D mdpolicy_3d({{0, 0, 0}}, {{n, n, n}}, {{4, 4, 4}});
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
index 597d1e3056ece9ef5865a3fb79dfef09ccf50a6a..75eca5403fd12ae09f2839ef696fafefa9f8f277 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 
 // These two View types are both 2-D arrays of double.  However, they
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
index 00bfeea36b972e6ea08ab8c82ec5aaca1a4e2af5..0544e572e7e9785369bfc824db783ea2fcd5af53 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
index 20e5c5a284f415e7627fd07df20ffbe5856f3428..52af4bd3b5ba84b3b5c1b53111900a9104e41922 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
@@ -49,7 +49,7 @@
 // the mesh.
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 
 using mesh_type = Kokkos::View<double***, Kokkos::LayoutRight>;
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
index 3c0fcd085c7c2afe29a328dfa3f574ab9ac81276..622b24b93131094ebd3c331d4c4f01ae14cca325 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
index a906ba1447283f3a5b2517e1f6c21839b458b597..596b25aaade065c9ade57f90107e17d6fda3d06a 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
index c582fa17043629bd65b253e6afabd76134f1817b..c03515479d0d7e0365272f87cbe49d11b21f13aa 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
@@ -46,7 +46,7 @@
 #include <cstdio>
 #include <typeinfo>
 #include <cmath>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 struct FillDevice {
   double value;
diff --git a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
index 9c5f2d62fc58b86cbdd723e3328cf0ba1e38df27..602122b61f1b94788b95052478a6123ae41fbfd9 100644
--- a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
+++ b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
@@ -45,7 +45,7 @@
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdlib>
 
 using DefaultHostType = Kokkos::HostSpace::execution_space;
@@ -74,7 +74,7 @@ using DefaultHostType = Kokkos::HostSpace::execution_space;
 template <class GeneratorPool>
 struct generate_random {
   // Output View for the random numbers
-  Kokkos::View<uint64_t*> vals;
+  Kokkos::View<uint64_t**> vals;
 
   // The GeneratorPool
   GeneratorPool rand_pool;
@@ -82,7 +82,7 @@ struct generate_random {
   int samples;
 
   // Initialize all members
-  generate_random(Kokkos::View<uint64_t*> vals_, GeneratorPool rand_pool_,
+  generate_random(Kokkos::View<uint64_t**> vals_, GeneratorPool rand_pool_,
                   int samples_)
       : vals(vals_), rand_pool(rand_pool_), samples(samples_) {}
 
@@ -94,8 +94,7 @@ struct generate_random {
     // Draw samples numbers from the pool as urand64 between 0 and
     // rand_pool.MAX_URAND64 Note there are function calls to get other type of
     // scalars, and also to specify Ranges or get a normal distributed float.
-    for (int k = 0; k < samples; k++)
-      vals(i * samples + k) = rand_gen.urand64();
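+    // Each thread i now fills row i of the 2D view directly, replacing the
+    // manual 1D index flattening above.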
+    for (int k = 0; k < samples; k++) vals(i, k) = rand_gen.urand64();
 
     // Give the state back, which will allow another thread to acquire it
     rand_pool.free_state(rand_gen);
@@ -103,11 +102,11 @@ struct generate_random {
 };
 
 int main(int argc, char* args[]) {
+  Kokkos::initialize(argc, args);
   if (argc != 3) {
     printf("Please pass two integers on the command line\n");
   } else {
     // Initialize Kokkos
-    Kokkos::initialize(argc, args);
     int size    = std::stoi(args[1]);
     int samples = std::stoi(args[2]);
 
@@ -117,7 +116,7 @@ int main(int argc, char* args[]) {
     // pool.
     Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
     Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
-    Kokkos::DualView<uint64_t*> vals("Vals", size * samples);
+    Kokkos::DualView<uint64_t**> vals("Vals", size, samples);
 
     // Run some performance comparisons
     Kokkos::Timer timer;
@@ -151,8 +150,7 @@ int main(int argc, char* args[]) {
            1.0e-9 * samples * size / time_1024);
 
     Kokkos::deep_copy(vals.h_view, vals.d_view);
-
-    Kokkos::finalize();
   }
+  Kokkos::finalize();
   return 0;
 }
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
index d36010892597bbcc9d1be710cae06574e7410ba7..cc20a497b2325825cf5faf01e2fd527e71efee53 100644
--- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash
index c601e0ee161fb11833c9f97014205585839a3717..5e33f592183b9da2a7f079a09feeab6943bceebf 100755
--- a/packages/kokkos/generate_makefile.bash
+++ b/packages/kokkos/generate_makefile.bash
@@ -162,6 +162,7 @@ display_help_text() {
       echo "                 VEGA900         = AMD GPU MI25 GFX900"
       echo "                 VEGA906         = AMD GPU MI50/MI60 GFX906"
       echo "                 VEGA908         = AMD GPU MI100 GFX908"
+      echo "                 VEGA90A         = "
       echo "               [ARM]"
       echo "                 ARMV80          = ARMv8.0 Compatible CPU"
       echo "                 ARMV81          = ARMv8.1 Compatible CPU"
@@ -478,5 +479,5 @@ if [[ ${COMPILER} == *clang* ]]; then
    fi
 fi
 
-echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH}
-cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH}
+echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
+cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index be8a5e7da5f4d8cada30fea4f78a21656268c8ef..69cd133b44b013145a3a4cfa4bb8c0124ef68d73 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -25,3 +25,4 @@ tag:  3.3.00     date: 12:16:2020    master: 734f577a    release: 1535ba5c
 tag:  3.3.01     date: 01:06:2021    master: 6d65b5a3    release: 4d23839c
 tag:  3.4.00     date: 04:26:2021    master: 1fb0c284    release: 5d7738d6
 tag:  3.4.01     date: 05:20:2021    master: 4b97a22f    release: 410b15c8
+tag:  3.5.00     date: 10:28:2021    master: c28a8b03    release: ddad6256
diff --git a/packages/kokkos/scripts/docker/Dockerfile.clang b/packages/kokkos/scripts/docker/Dockerfile.clang
index 6aaf75fae55ff975df5045bb73a0813236871d89..92999a8a44a54c22a40e717a5858a9d0dc5b7199 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.clang
+++ b/packages/kokkos/scripts/docker/Dockerfile.clang
@@ -9,16 +9,22 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
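+# The keydump above replaces the per-key --recv-keys fetches from the
+# (since retired) sks-keyservers pool used further down.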
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
@@ -28,13 +34,11 @@ ENV PATH=${CMAKE_DIR}/bin:$PATH
 
 ENV LLVM_DIR=/opt/llvm
 RUN LLVM_VERSION=8.0.0 && \
-    LLVM_KEY=345AD05D && \
     LLVM_URL=http://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \
     LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
     wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \
     wget --quiet ${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${LLVM_KEY} && \
     gpg --verify ${LLVM_ARCHIVE}.sig ${LLVM_ARCHIVE} && \
     mkdir -p ${LLVM_DIR} && \
     tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.gcc b/packages/kokkos/scripts/docker/Dockerfile.gcc
index 56972d3185d0f62e6b9effb64e8f2cedefe25c66..51d50e64063b611a79a86c8bea159c6435bdc492 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.gcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.gcc
@@ -1,15 +1,21 @@
 FROM gcc:5.3.0
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.hipcc b/packages/kokkos/scripts/docker/Dockerfile.hipcc
index d3b6b93a023396aa785703a5aeec0c4001af34e8..5bef7f2ef814ad7420b8c0d4bdefa0961f4dd211 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.hipcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.hipcc
@@ -1,8 +1,7 @@
-ARG BASE=rocm/dev-ubuntu-20.04:3.8
+ARG BASE=rocm/dev-ubuntu-20.04:4.2
 FROM $BASE
 
 RUN apt-get update && apt-get install -y \
-        git \
         kmod \
         wget \
         ccache \
@@ -13,16 +12,22 @@ RUN apt-get update && apt-get install -y \
 
 ENV PATH=/opt/rocm/bin:$PATH
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
index 5d53a645e4bc7c551698719d3edb1c3768467ca7..3de9a7f5804f938d3c4056c723f9a25ceb189242 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
+++ b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
@@ -11,16 +11,22 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvcc b/packages/kokkos/scripts/docker/Dockerfile.nvcc
index e17accc0663980694821b8002b976277fcd9ca42..8a054066bde8e1983c9b9baf511836c88eabefa5 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.nvcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.nvcc
@@ -5,7 +5,6 @@ ARG ADDITIONAL_PACKAGES
 
 RUN apt-get update && apt-get install -y \
         bc \
-        git \
         wget \
         ccache \
         $ADDITIONAL_PACKAGES \
@@ -13,16 +12,22 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
index b6efcb82cae1a8da1cf82e050bf4ad7b8a7870e4..5a676ca32a484dec5ee6e89ca3b4acf21acbbd81 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.openmptarget
+++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
@@ -14,16 +14,22 @@ RUN apt-get update && apt-get install -y \
 
 ARG NPROC=8
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.18.5
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver hkps.pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl
index fdcd6d01fb8e3158000aa1507bb5bfcf7e0d9b4e..3393d0da8a7f257f71d79eb5dfd9f76b5bfd6a31 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.sycl
+++ b/packages/kokkos/scripts/docker/Dockerfile.sycl
@@ -3,25 +3,31 @@ FROM $BASE
 
 RUN apt-get update && apt-get install -y \
         bc \
-        git \
         wget \
         ccache \
         ninja-build \
         python3 \
+        git \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.18.5
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
@@ -30,7 +36,7 @@ RUN CMAKE_KEY=2D2CEF1034921684 && \
 ENV PATH=${CMAKE_DIR}/bin:$PATH
 
 ENV SYCL_DIR=/opt/sycl
-RUN SYCL_VERSION=20210311 && \
+RUN SYCL_VERSION=20210621 && \
     SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \
     SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
diff --git a/packages/kokkos/scripts/testing_scripts/test_all_sandia b/packages/kokkos/scripts/testing_scripts/test_all_sandia
index 877b35b73e1aef7c64cdb2d7e5f00f7bc235781c..3e0295643e48b8af85a8aa39874545f7340b157b 100755
--- a/packages/kokkos/scripts/testing_scripts/test_all_sandia
+++ b/packages/kokkos/scripts/testing_scripts/test_all_sandia
@@ -108,6 +108,10 @@ if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name
   MACHINE=mayer
 fi
 
+if [[ "$HOSTNAME" == caraway* ]]; then
+  MACHINE=caraway
+fi
+
 if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then
   MACHINE=kokkos-dev
 fi
@@ -302,6 +306,7 @@ if [ "$MACHINE" = "sems" ]; then
                "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
@@ -430,8 +435,8 @@ elif [ "$MACHINE" = "weaver" ]; then
 
   BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
   IBM_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
-  CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1"
-  CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1"
+  CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,ibm/xl/16.1.1,gcc/7.2.0"
+  CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,ibm/xl/16.1.1,gcc/7.4.0"
 
   # Don't do pthread with Power
   GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
@@ -494,6 +499,23 @@ elif [ "$MACHINE" = "mayer" ]; then
     ARCH_FLAG="--arch=ARMV8_THUNDERX2"
   fi
 
+elif [ "$MACHINE" = "caraway" ]; then
+  SKIP_HWLOC=True
+
+  BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
+
+  HIPCLANG_BUILD_LIST="Hip_Serial,Hip_OpenMP"
+  HIPCLANG_WARNING_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG"
+
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("rocm/4.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
+             "rocm/4.3.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
+  )
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=VEGA906"
+  fi
+
 elif [ "$MACHINE" = "blake" ]; then
   source /etc/profile.d/modules.sh
   SKIP_HWLOC=True
@@ -597,8 +619,9 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then
 
   BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
   GCC91_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>"
-  NVCC9_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
+  NVCC9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
   NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
+  NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
   NVCC11_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0"
 
   CLANG8_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/10.0"
@@ -620,13 +643,16 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then
   else
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/10.1 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/10.1 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/11.0 $NVCC11_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/11.1 $NVCC_SEMSMODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/11.2 $NVCC11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/9.2 $NVCC9_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
                "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
                "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS"
diff --git a/packages/kokkos/scripts/testing_scripts/update_lib.sh b/packages/kokkos/scripts/testing_scripts/update_lib.sh
index 34ab5dd3c9a0afae4b10b70d99772308f35b3f9f..ee2f66dc407a76dd7f70c6c14b789fd42584fb11 100755
--- a/packages/kokkos/scripts/testing_scripts/update_lib.sh
+++ b/packages/kokkos/scripts/testing_scripts/update_lib.sh
@@ -20,7 +20,7 @@ check_sems_clang() {
   CLANGVER=$(clang --version | grep "clang version" | cut -d " " -f 3)
   if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then
     # Newer gcc needed for c++ standard beyond c++14
-    module swap sems-gcc/5.3.0 sems-gcc/6.4.0
+    module swap sems-gcc/5.3.0 sems-gcc/8.3.0
     module list
   fi
 }