diff --git a/packages/CLI11/.appveyor.yml b/packages/CLI11/.appveyor.yml
index 18915a1748725d232d7c75bf36f912e46f2244da..59bb8326b6a00dcb870df92f866cd9f50f210d8e 100644
--- a/packages/CLI11/.appveyor.yml
+++ b/packages/CLI11/.appveyor.yml
@@ -1,8 +1,8 @@
-version: 2.1.1.{build}
+version: 2.1.2.{build}
 
 branches:
   only:
-    - master
+    - main
     - v1
 
 install:
diff --git a/packages/CLI11/.github/CONTRIBUTING.md b/packages/CLI11/.github/CONTRIBUTING.md
index 330d8b15b9636b092c3fb37a9c93daa4da7fa810..2f44c9da7d1a6d849dc94d81aa24200c86fd6442 100644
--- a/packages/CLI11/.github/CONTRIBUTING.md
+++ b/packages/CLI11/.github/CONTRIBUTING.md
@@ -76,9 +76,8 @@ Steps:
 
 * Update changelog if needed
 * Update the version in `.appveyor.yml` and `include/CLI/Version.hpp`.
-* Find and replace in README:
+* Find and replace in README (new minor/major release only):
   * Replace " 🆕" and "🆕 " with "" (ignores the description line)
   * Check for `\/\/$` (vi syntax) to catch leftover `// 🆕`
   * Replace "🚧" with "🆕" (manually ignore the description line)
 * Make a release in the GitHub UI, use a name such as "Version X.Y(.Z): Title"
-* Currently, the release action wipes the title after you release, so remember to edit the title back to the original name after the `CLI11.hpp` file gets uploaded.
diff --git a/packages/CLI11/.github/workflows/build.yml b/packages/CLI11/.github/workflows/build.yml
index 11ed11ca405d775ba9517c60cdef9d9de77f60ea..a6f250409d9d2736f19a6adf26b84a9507e400ca 100644
--- a/packages/CLI11/.github/workflows/build.yml
+++ b/packages/CLI11/.github/workflows/build.yml
@@ -2,13 +2,11 @@ name: Build
 on:
   push:
     branches:
-      - master
+      - main
       - v*
     tags:
       - "*"
   pull_request:
-    branches:
-      - master
 
 jobs:
   single-header:
diff --git a/packages/CLI11/.github/workflows/tests.yml b/packages/CLI11/.github/workflows/tests.yml
index 020ba3b2b94d03c2e3e7519a5f2877620d542385..a48b4e367526e2d34ba8aa30353d7df784c65aef 100644
--- a/packages/CLI11/.github/workflows/tests.yml
+++ b/packages/CLI11/.github/workflows/tests.yml
@@ -2,21 +2,11 @@ name: Tests
 on:
   push:
     branches:
-      - master
+      - main
       - v*
   pull_request:
-    branches:
-      - master
 
 jobs:
-  pre-commit:
-    name: Formatting
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-    - uses: actions/setup-python@v2
-    - uses: pre-commit/action@v2.0.3
-
   cuda-build:
     name: CUDA build only
     runs-on: ubuntu-latest
@@ -27,13 +17,34 @@ jobs:
         submodules: true
     - name: Add wget
       run: apt-get update && apt-get install -y wget
-    - name: Setup cmake
+    - name: Get cmake
       uses: jwlawson/actions-setup-cmake@v1.11
     - name: Configure
       run: cmake -S . -B build -DCLI11_CUDA_TESTS=ON
     - name: Build
       run: cmake --build build -j2
 
+
+  boost-build:
+    name: Boost build
+    runs-on: ubuntu-latest
+    container: zouzias/boost:1.76.0
+    steps:
+    - uses: actions/checkout@v1
+      with:
+        submodules: true
+    - name: Add deps
+      run: apt-get update && apt-get install make
+    - name: Get CMake
+      uses: jwlawson/actions-setup-cmake@v1.11
+    - name: Configure
+      run: cmake -S . -B build -DCLI11_BOOST=ON
+    - name: Build
+      run: cmake --build build -j2
+    - name: Run tests
+      run: ctest --output-on-failure
+      working-directory: build
+
   cmake-config:
     name: CMake config check
     runs-on: ubuntu-latest
@@ -148,3 +159,10 @@ jobs:
         cmake-version: "3.21"
         args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
       if: success() || failure()
+
+    - name: Check CMake 3.22 (full)
+      uses: ./.github/actions/quick_cmake
+      with:
+        cmake-version: "3.22"
+        args: -DCLI11_SANITIZERS=ON -DCLI11_BUILD_EXAMPLES_JSON=ON
+      if: success() || failure()
diff --git a/packages/CLI11/.gitrepo b/packages/CLI11/.gitrepo
index 6e7179c3001b3661e634d9725debd699b8d62d92..9e19e3887de346a0389ceac098b60a6dd619ed04 100644
--- a/packages/CLI11/.gitrepo
+++ b/packages/CLI11/.gitrepo
@@ -5,8 +5,8 @@
 ;
 [subrepo]
 	remote = git@github.com:CLIUtils/CLI11.git
-	branch = master
-	commit = b440890eaf29d526e13997f67c2e0288c7c3c60f
-	parent = 4bd033645f2c3c03bdf1682e34ec57bbd2b5dd21
-	cmdver = 0.4.3
+	branch = main
+	commit = 70f8072f9dd2292fd0b9f9e5f58e279f60483ed3
+	parent = ae4cfe8875caf314c7f66eec2e6d09d5ee321e6a
 	method = merge
+	cmdver = 0.4.3
diff --git a/packages/CLI11/.pre-commit-config.yaml b/packages/CLI11/.pre-commit-config.yaml
index 00d77d22fa0a21d7e121628ad9adaaa111a9c717..84135427c6baa312b9911497c4b687a9bcfcacf9 100644
--- a/packages/CLI11/.pre-commit-config.yaml
+++ b/packages/CLI11/.pre-commit-config.yaml
@@ -1,8 +1,6 @@
 ci:
   autoupdate_commit_msg: "chore(deps): pre-commit.ci autoupdate"
   autofix_commit_msg: "style: pre-commit.ci fixes"
-  skip:
-    - docker-clang-format
 
 
 repos:
@@ -23,17 +21,10 @@ repos:
   - id: mixed-line-ending
   - id: trailing-whitespace
 
-- repo: local
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: v13.0.0
   hooks:
-  - id: docker-clang-format
-    name: Docker Clang Format
-    language: docker_image
-    types:
-    - c++
-    entry: unibeautify/clang-format:latest
-    args:
-    - -style=file
-    - -i
+  - id: clang-format
 
 - repo: https://github.com/cheshirekow/cmake-format-precommit
   rev: v0.6.13
diff --git a/packages/CLI11/.travis.yml b/packages/CLI11/.travis.yml
deleted file mode 100644
index d352aed96d47a9169595813dca8fa8ff2899a69e..0000000000000000000000000000000000000000
--- a/packages/CLI11/.travis.yml
+++ /dev/null
@@ -1,122 +0,0 @@
-language: cpp
-dist: trusty
-
-# Exclude ghpages,
-# but even better, don't build branch and PR, just PR
-# Include tags starting with v and a digit
-branches:
-  only:
-  - master
-  - /^v\d/
-
-cache:
-  apt: true
-  directories:
-  - "${TRAVIS_BUILD_DIR}/deps/doxygen"
-
-matrix:
-  include:
-    # Default clang
-  - compiler: clang
-    script:
-    - .ci/make_and_test.sh 11
-    - .ci/make_and_test.sh 14
-    - .ci/make_and_test.sh 17
-
-    # Docs and clang 3.5
-  - compiler: clang
-    language: node_js
-    node_js: "7.4.0"
-    env:
-    - DEPLOY_MAT=yes
-    addons:
-      apt:
-        packages:
-        - clang-3.5
-    install:
-    - export CC=clang-3.5
-    - export CXX=clang++-3.5
-    - npm install gitbook-cli -g
-    - gitbook fetch 3.2.3
-    - gitbook install book
-    script:
-    - .ci/make_and_test.sh 11
-    after_success:
-    - export DEPS_DIR="${TRAVIS_BUILD_DIR}/deps"
-    - . .ci/build_doxygen.sh
-    - doxygen docs/Doxyfile
-    - gitbook build book html/book
-
-    # GCC 7 and coverage (8 does not support lcov, wait till 9 and new lcov)
-  - compiler: gcc
-    dist: bionic
-    addons:
-      apt:
-        packages:
-        - curl
-        - lcov
-    install:
-    - DEPS_DIR="${TRAVIS_BUILD_DIR}/deps"
-    - cd $TRAVIS_BUILD_DIR
-    - ". .ci/build_lcov.sh"
-    - ".ci/run_codecov.sh"
-    script:
-    - .ci/make_and_test.sh 11 -DCLI11_EXAMPLE_JSON=ON
-    - .ci/make_and_test.sh 14 -DCLI11_EXAMPLE_JSON=ON
-    - .ci/make_and_test.sh 17 -DCLI11_EXAMPLE_JSON=ON
-
-    # GCC 4.8 and Conan
-  - compiler: gcc
-    dist: bionic
-    addons:
-      apt:
-        packages:
-        - python3-pip
-        - python3-setuptools
-    install:
-    - python3 -VV
-    - python3 -m pip install --user conan
-    - conan user
-    script:
-    - .ci/make_and_test.sh 11
-    after_success:
-    - conan create . cliutils/stable
-    - |
-      if [ "${TRAVIS_TAG}" ]
-      then
-        conan remote add origin https://api.bintray.com/conan/cliutils/CLI11
-        conan user -p ${BINFROG_API_KEY} -r origin henryiii
-        conan upload "*" -c -r origin --all
-      fi
-
-
-install: skip
-
-script:
-- .ci/make_and_test.sh 11
-- .ci/make_and_test.sh 14
-
-
-deploy:
-- provider: pages
-  skip_cleanup: true
-  github_token: ${GH_REPO_TOKEN}
-  keep_history: false
-  local_dir: ${TRAVIS_BUILD_DIR}/html
-  on:
-    branch: master
-    condition: "$DEPLOY_MAT = yes"
-
-notifications:
-  webhooks:
-    urls:
-    - https://webhooks.gitter.im/e/bbdb3befce4c00448d24
-    on_success: change
-    on_failure: always
-    on_start: never
-
-env:
-  global:
-  - secure: cY0OI609iTAxLRYuYQnNMi+H6n0dBwioTAoFXGGRTnngw2V9om3UmY5eUu4HQEQsQZovHdYpNhlSgRmdwQ4UqSp3FGyrwobf0kzacV4bVnMDeXDmHt8RzE5wP/LwDd8elNF6RRYjElY99f0k0FyXVd0fIvuVkGKQECNLOtEk0jQo+4YTh7dhuCxRhBYgTbNiRL6UJynfrcK0YN+DQ+8CJNupu2VxgaEpCSngTfvDHLcddcrXwpvn3MPc3FsDUbtN389ZCIe41qqIL0ATv46DQaTw4FOevyVfRyrBOznONoGCVeAYKL6VBdrk01Fh6aytF5zgI3hKaKobgEn+QFfzR6l68c6APvqA0Qv39iLjuh6KbdIV2YsqXfyt6FBgqP2xZuNEZW1jZ8LxUOLl2I40UEh87nFutvnSbfIzN+FcLrajm2H2jV2kZGNKAMx+4qxkZuXSre4JPkENfJm2WNFAKlqPt4ZSEQarkDYzZPcEr2I9fbGjQYVJICoN4LikCv9K5z7ujpTxCTNbVpQWZcEOT6QQBc6Vml/N/NKAIl9o2OeTLiXCmT31+KQMeO492KYNQ6VmkeqrVhGExOUcJdNyDJV9C+3mSekb3Sq78SneYRKDechkWbMl0ol07wGTdBwQQwgaorjRyn07x1rDxpPr3z19/+eubnpPUW4UQ5MYsjs=
-  - secure: G6H5HA9pPUgsd96A+uvTxbLjR1rcT9NtxsknIkFDfzGDpffn6wVX+kCIQLf9zFDnQnsfYA/4piiuoBN5U5C7HQrh9UCvBVptXjWviea0Y7CRbMJZpw2rPvXWQtrFNzYkaV7kdJ5B0Mmvh6rcH/I8gKFrkdjF7i7sfzWdFWRU5QXfxXOk2n+xCXX6uFemxHH9850XEjVtnU7YYUebQFaoTYLLy05nlt9JaEF84wfJljY/SJX7I9gpNLtizE9MpJylnrwUeL66OqFievmjL3/bWpPUBjUF0WdtXYlVDja7O582FQDs94ofgqeGieGIMQ0VuovpbQOJSdjs5XHZwu2ce6HZxtOhJJqw6xEwbq43ZdofAlJ5GUEOgrr+j25zIDkdzOhliDKJtw5ysmmTUKEcZ36iWbCE0YP/IC42yOV9oOP6UkgbuwpVDdxAFRgLZLahW9Ok+c1PlzIauPxv+jIEI4rSEEJRKZG2JK3TXUdhd58mHBfQMNjKQMF+Y2wCCGjfMO0q4SgvBhYyb4oBTxEqnc2Pzh2DJdNzRFsV7ktsQSRglHGVI+1XTmQ+2kbBzNOQBLjOuRvDZENUhyxPKGZDHyAOMlVvYm8vvWebM1/F3YgDb/tPh33+EGSvpKkCZ5nUxB5e605H6gdYlNKNhuWKlEKTo2/kF0D39gAUCIcGbzw=
-  - CCACHE_CPP2: yes
diff --git a/packages/CLI11/CHANGELOG.md b/packages/CLI11/CHANGELOG.md
index 41c193814d3868ffe9895583a44205751485981d..e35ca38676833f3891fbbd8ea8f0ed8a0808588c 100644
--- a/packages/CLI11/CHANGELOG.md
+++ b/packages/CLI11/CHANGELOG.md
@@ -11,7 +11,7 @@ is not passed, or every time the option is parsed.
 
 * Option/subcommand name restrictions have been relaxed. Most characters are now allowed. [#627][]
 * The config parser can accept streams, specify a specific section, and inline comment characters are supported [#630][]
-* `force_callback` & `trigger_on_parse` added, allowing a callback to always run on parse even if not present or every time the option is parsed[#631][]
+* `force_callback` & `trigger_on_parse` added, allowing a callback to always run on parse even if not present or every time the option is parsed [#631][]
 * Bugfix(cmake): Only add `CONFIGURE_DEPENDS` if CLI11 is the main project [#633][]
 * Bugfix(cmake): Ensure the cmake/pkg-config files install to a arch independent path [#635][]
 * Bugfix: The single header file generation was missing the include guard. [#620][]
@@ -26,8 +26,24 @@ is not passed, or every time the option is parsed.
 ### Version 2.1.1: Quick Windows fix
 
 * A collision with `min`/`max` macros on Windows has been fixed. [#642][]
+* Tests pass with Boost again [#646][]
+* Running the pre-commit hooks in development no longer requires docker for clang-format [#647][]
 
 [#642]: https://github.com/CLIUtils/CLI11/pull/642
+[#646]: https://github.com/CLIUtils/CLI11/pull/646
+[#647]: https://github.com/CLIUtils/CLI11/pull/647
+
+## Version 2.1.2: Better subproject builds
+
+* Use `main` for the main branch of the repository [#657][]
+* Bugfix(cmake): Enforce at least C++11 when using the CMake target [#656][]
+* Build: Don't run Doxygen or include CTest when built as a subproject [#656][]
+* Build: Avoid a warning on CMake 3.22 [#656][]
+* Build: Support compiling the tests with an external copy of Catch2 [#653][]
+
+[#653]: https://github.com/CLIUtils/CLI11/pull/653
+[#656]: https://github.com/CLIUtils/CLI11/pull/656
+[#657]: https://github.com/CLIUtils/CLI11/pull/657
 
 ## Version 2.0: Simplification
 
@@ -610,7 +626,7 @@ This release focused on cleaning up the most exotic compiler warnings, fixing a
 
 ## Version 0.8: CLIUtils
 
-This release moved the repository to the CLIUtils master organization.
+This release moved the repository to the CLIUtils main organization.
 
 * Moved to CLIUtils on GitHub
 * Fixed docs build and a few links
@@ -645,7 +661,7 @@ Lots of cleanup and docs additions made it into this release. Parsing is simpler
 * `->ignore_case()` added to subcommands, options, and `add_set_ignore_case`. Subcommands inherit setting from parent App on creation.
 * Subcommands now can be "chained", that is, left over arguments can now include subcommands that then get parsed. Subcommands are now a list (`get_subcommands`). Added `got_subcommand(App_or_name)` to check for subcommands.
 * Added `.allow_extras()` to disable error on failure. Parse returns a vector of leftover options. Renamed error to `ExtrasError`, and now triggers on extra options too.
-* Added `require_subcommand` to `App`, to simplify forcing subcommands. Do **not** do `add_subcommand()->require_subcommand`, since that is the subcommand, not the master `App`.
+* Added `require_subcommand` to `App`, to simplify forcing subcommands. Do **not** do `add_subcommand()->require_subcommand`, since that is the subcommand, not the main `App`.
 * Added printout of ini file text given parsed options, skips flags.
 * Support for quotes and spaces in ini files
 * Fixes to allow support for Windows (added Appveyor) (Uses `-`, not `/` syntax)
diff --git a/packages/CLI11/CMakeLists.txt b/packages/CLI11/CMakeLists.txt
index 3227e409d9e1d982f4ffd3a58ae9e99d297663b4..1f4313ff59743a81a75ce1f5e15d3afa29c7c5ed 100644
--- a/packages/CLI11/CMakeLists.txt
+++ b/packages/CLI11/CMakeLists.txt
@@ -2,14 +2,14 @@ cmake_minimum_required(VERSION 3.4)
 # Note: this is a header only library. If you have an older CMake than 3.4,
 # just add the CLI11/include directory and that's all you need to do.
 
-# Make sure users don't get warnings on a tested (3.4 to 3.21) version
+# Make sure users don't get warnings on a tested (3.4 to 3.22) version
 # of CMake. For most of the policies, the new version is better (hence the change).
 # We don't use the 3.4...3.21 syntax because of a bug in an older MSVC's
 # built-in and modified CMake 3.11
-if(${CMAKE_VERSION} VERSION_LESS 3.21)
+if(${CMAKE_VERSION} VERSION_LESS 3.22)
   cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
 else()
-  cmake_policy(VERSION 3.21)
+  cmake_policy(VERSION 3.22)
 endif()
 
 set(VERSION_REGEX "#define CLI11_VERSION[ \t]+\"(.+)\"")
@@ -30,18 +30,29 @@ project(
 # Print the version number of CMake if this is the main project
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
   message(STATUS "CMake ${CMAKE_VERSION}")
+
+  find_package(Doxygen)
+
+  if(CMAKE_VERSION VERSION_LESS 3.10)
+    message(STATUS "CMake 3.10+ adds Doxygen support. Update CMake to build documentation")
+  elseif(NOT Doxygen_FOUND)
+    message(STATUS "Doxygen not found, building docs has been disabled")
+  endif()
+
+  include(CTest)
+else()
+  if(NOT DEFINED BUILD_TESTING)
+    set(BUILD_TESTING OFF)
+  endif()
 endif()
 
 include(CMakeDependentOption)
 include(GNUInstallDirs)
-include(CTest)
 
 if(NOT CMAKE_VERSION VERSION_LESS 3.11)
   include(FetchContent)
 endif()
 
-find_package(Doxygen)
-
 list(APPEND force-libcxx "CMAKE_CXX_COMPILER_ID STREQUAL \"Clang\"")
 list(APPEND force-libcxx "CMAKE_SYSTEM_NAME STREQUAL \"Linux\"")
 list(APPEND force-libcxx "CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME")
@@ -118,12 +129,6 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
   set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 endif()
 
-if(CMAKE_VERSION VERSION_LESS 3.10)
-  message(STATUS "CMake 3.10+ adds Doxygen support. Update CMake to build documentation")
-elseif(NOT Doxygen_FOUND)
-  message(STATUS "Doxygen not found, building docs has been disabled")
-endif()
-
 # Special target that adds warnings. Is not exported.
 add_library(CLI11_warnings INTERFACE)
 
@@ -154,6 +159,22 @@ add_library(CLI11::CLI11 ALIAS CLI11) # for add_subdirectory calls
 target_include_directories(CLI11 INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
                                            $<INSTALL_INTERFACE:include>)
 
+if(CMAKE_VERSION VERSION_LESS 3.8)
+  # This might not be a complete list
+  target_compile_features(
+    CLI11
+    INTERFACE cxx_lambdas
+              cxx_nullptr
+              cxx_override
+              cxx_range_for
+              cxx_right_angle_brackets
+              cxx_strong_enums
+              cxx_constexpr
+              cxx_auto_type)
+else()
+  target_compile_features(CLI11 INTERFACE cxx_std_11)
+endif()
+
 # To see in IDE, headers must be listed for target
 set(header-patterns "${PROJECT_SOURCE_DIR}/include/CLI/*")
 if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME AND NOT CMAKE_VERSION VERSION_LESS 3.12)
@@ -254,6 +275,7 @@ if(CLI11_SINGLE_FILE)
 endif()
 
 if(CLI11_BUILD_TESTS)
+  include(CTest)
   add_subdirectory(tests)
 endif()
 
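The CMakeLists.txt changes above are the core of the "better subproject builds" theme: Doxygen detection and `include(CTest)` now only happen when CLI11 is the top-level project, `BUILD_TESTING` defaults to OFF otherwise, and the `CLI11` interface target advertises `cxx_std_11` (or an equivalent feature list on CMake older than 3.8). A minimal consumer sketch, assuming CLI11 is vendored at `extern/CLI11` (the path and project name are illustrative):

```cmake
# Hypothetical parent project embedding CLI11 via add_subdirectory.
cmake_minimum_required(VERSION 3.4)
project(my_tool LANGUAGES CXX)

# With the change above, this no longer pulls in CTest/Doxygen for the parent,
# and CLI11's tests stay off by default when it is not the top-level project.
add_subdirectory(extern/CLI11)

add_executable(my_tool main.cpp)
# Linking the interface target now also requires at least C++11
# through the propagated compile feature.
target_link_libraries(my_tool PRIVATE CLI11::CLI11)
```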
diff --git a/packages/CLI11/README.md b/packages/CLI11/README.md
index 229b65451b0665a5ae4ae8250be454818f025251..ad1ac1e97b63de4b8c4501b74831c149c3633d21 100644
--- a/packages/CLI11/README.md
+++ b/packages/CLI11/README.md
@@ -2,19 +2,20 @@
 
 ![CLI11 Logo](./docs/CLI11_300.png)
 
-[![Build Status Linux and macOS][travis-badge]][travis]
-[![Build Status Windows][appveyor-badge]][appveyor]
 [![Build Status Azure][azure-badge]][azure]
 [![Actions Status][actions-badge]][actions-link]
+[![Build Status AppVeyor][appveyor-badge]][appveyor]
 [![Code Coverage][codecov-badge]][codecov]
 [![Codacy Badge][codacy-badge]][codacy-link]
-[![Gitter chat][gitter-badge]][gitter]
 [![License: BSD][license-badge]](./LICENSE)
-[![Latest release][releases-badge]][github releases]
 [![DOI][doi-badge]][doi-link]
+
+[![Gitter chat][gitter-badge]][gitter]
+[![Latest GHA release][releases-badge]][github releases]
+[![Latest release][repology-badge]][repology]
 [![Conan.io][conan-badge]][conan-link]
 [![Conda Version][conda-badge]][conda-link]
-[![Try CLI11 2.0 online][wandbox-badge]][wandbox-link]
+[![Try CLI11 2.1 online][wandbox-badge]][wandbox-link]
 
 [What's new](./CHANGELOG.md) •
 [Documentation][gitbook] •
@@ -57,14 +58,14 @@ CLI11 is a command line parser for C++11 and beyond that provides a rich feature
 * [Contribute](#contribute)
 * [License](#license)
 
-Features that were added in the last released major version are marked with "🆕". Features only available in master are marked with "🚧".
+Features that were added in the last released major version are marked with "🆕". Features only available in main are marked with "🚧".
 
 ## Background
 
 ### Introduction
 
 CLI11 provides all the features you expect in a powerful command line parser, with a beautiful, minimal syntax and no dependencies beyond C++11. It is header only, and comes in a single file form for easy inclusion in projects. It is easy to use for small projects, but powerful enough for complex command line projects, and can be customized for frameworks.
-It is tested on [Travis][], [AppVeyor][], [Azure][], and [GitHub Actions][actions-link], and is used by the [GooFit GPU fitting framework][goofit]. It was inspired by [`plumbum.cli`][plumbum] for Python. CLI11 has a user friendly introduction in this README, a more in-depth tutorial [GitBook][], as well as [API documentation][api-docs] generated by Travis.
+It is tested on [Azure][] and [GitHub Actions][actions-link], and was originally used by the [GooFit GPU fitting framework][goofit]. It was inspired by [`plumbum.cli`][plumbum] for Python. CLI11 has a user friendly introduction in this README, a more in-depth tutorial [GitBook][], as well as [API documentation][api-docs] generated by Travis.
 See the [changelog](./CHANGELOG.md) or [GitHub Releases][] for details for current and past releases. Also see the [Version 1.0 post][], [Version 1.3 post][], [Version 1.6 post][], or [Version 2.0 post][] for more information.
 
 You can be notified when new releases are made by subscribing to <https://github.com/CLIUtils/CLI11/releases.atom> on an RSS reader, like Feedly, or use the releases mode of the GitHub watching tool.
@@ -77,7 +78,7 @@ An acceptable CLI parser library should be all of the following:
 * Short, simple syntax: This is one of the main reasons to use a CLI parser, it should make variables from the command line nearly as easy to define as any other variables. If most of your program is hidden in CLI parsing, this is a problem for readability.
 * C++11 or better: Should work with GCC 4.8+ (default on CentOS/RHEL 7), Clang 3.4+, AppleClang 7+, NVCC 7.0+, or MSVC 2015+.
 * Work on Linux, macOS, and Windows.
-* Well tested using [Travis][] (Linux) and [AppVeyor][] (Windows) or [Azure][] (all three). "Well" is defined as having good coverage measured by [CodeCov][].
+* Well tested on all common platforms and compilers. "Well" is defined as having good coverage measured by [CodeCov][].
 * Clear help printing.
 * Nice error messages.
 * Standard shell idioms supported naturally, like grouping flags, a positional separator, etc.
@@ -128,7 +129,7 @@ So, this library was designed to provide a great syntax, good compiler compatibi
 There are some other possible "features" that are intentionally not supported by this library:
 
 * Non-standard variations on syntax, like `-long` options. This is non-standard and should be avoided, so that is enforced by this library.
-* Completion of partial options, such as Python's `argparse` supplies for incomplete arguments. It's better not to guess. Most third party command line parsers for python actually reimplement command line parsing rather than using argparse because of this perceived design flaw.
+* Completion of partial options, such as Python's `argparse` supplies for incomplete arguments. It's better not to guess. Most third-party command line parsers for Python actually reimplement command line parsing rather than using argparse because of this perceived design flaw (recent versions do have an option to disable it).
 * Autocomplete: This might eventually be added to both Plumbum and CLI11, but it is not supported yet.
 * Wide strings / unicode: Since this uses the standard library only, it might be hard to properly implement, but I would be open to suggestions in how to do this.
 
@@ -139,7 +140,7 @@ To use, there are several methods:
 * All-in-one local header: Copy `CLI11.hpp` from the [most recent release][github releases] into your include directory, and you are set. This is combined from the source files  for every release. This includes the entire command parser library, but does not include separate utilities (like `Timer`, `AutoTimer`). The utilities are completely self contained and can be copied separately.
 * All-in-one global header: Like above, but copying the file to a shared folder location like `/opt/CLI11`. Then, the C++ include path has to be extended to point at this folder. With CMake, use `include_directories(/opt/CLI11)`
 * Local headers and target: Use `CLI/*.hpp` files. You could check out the repository as a git submodule, for example. With CMake, you can use `add_subdirectory` and the `CLI11::CLI11` interface target when linking. If not using a submodule, you must ensure that the copied files are located inside the same tree directory than your current project, to prevent an error with CMake and `add_subdirectory`.
-* Global headers: Use `CLI/*.hpp` files stored in a shared folder. You could check out the git repository in a system-wide folder, for example `/opt/`. With CMake, you could add to the include path via:
+* Global headers: Use `CLI/*.hpp` files stored in a shared folder. You could check out the git repository to a system-wide folder, for example `/opt/`. With CMake, you could add to the include path via:
 
 ```bash
 if(NOT DEFINED CLI11_DIR)
@@ -159,15 +160,27 @@ And then in the source code (adding several headers might be needed to prevent l
 * Global headers and target: configuring and installing the project is required for linking CLI11 to your project in the same way as you would do with any other external library. With CMake, this step allows using `find_package(CLI11 CONFIG REQUIRED)` and then using the `CLI11::CLI11` target when linking. If `CMAKE_INSTALL_PREFIX` was changed during install to a specific folder like `/opt/CLI11`, then you have to pass `-DCLI11_DIR=/opt/CLI11` when building your current project. You can also use [Conan.io][conan-link] or [Hunter][].
     (These are just conveniences to allow you to use your favorite method of managing packages; it's just header only so including the correct path and
     using C++11 is all you really need.)
+* Via FetchContent in CMake 3.14+ (or 3.11+ with more work): you can add this with FetchContent, then use the `CLI11::CLI11` target as above, and CMake will download the project in the configure stage:
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(
+  cli11
+  GIT_REPOSITORY https://github.com/CLIUtils/CLI11
+  GIT_TAG        v2.1.2
+)
+
+FetchContent_MakeAvailable(cli11)
+```
+
+It is highly recommended that you use the git hash for `GIT_TAG` instead of a tag or branch, as that is both more secure and faster to reconfigure - CMake will not have to reach out to the internet to see if the tag moved. You can also download just the single header file from the releases using `file(DOWNLOAD`.
 
 To build the tests, checkout the repository and use CMake:
 
 ```bash
-mkdir build
-cd build
-cmake ..
-make
-GTEST_COLOR=1 CTEST_OUTPUT_ON_FAILURE=1 make test
+cmake -S . -B build
+cmake --build build
+CTEST_OUTPUT_ON_FAILURE=1 cmake --build build -t test
 ```
 
 <details><summary>Note: Special instructions for GCC 8</summary><p>
@@ -592,7 +605,7 @@ There are several options that are supported on the main app and subcommands and
 * `.got_subcommand(App_or_name)`: Check to see if a subcommand was received on the command line.
 * `.get_subcommands(filter)`: The list of subcommands that match a particular filter function.
 * `.add_option_group(name="", description="")`: Add an [option group](#option-groups) to an App,  an option group is specialized subcommand intended for containing groups of options or other groups for controlling how options interact.
-* `.get_parent()`: Get the parent App or `nullptr` if called on master App.
+* `.get_parent()`: Get the parent App or `nullptr` if called on main App.
 * `.get_option(name)`: Get an option pointer by option name will throw if the specified option is not available,  nameless subcommands are also searched
 * `.get_option_no_throw(name)`: Get an option pointer by option name. This function will return a `nullptr` instead of throwing if the option is not available.
 * `.get_options(filter)`: Get the list of all defined option pointers (useful for processing the app for custom output formats).
@@ -604,7 +617,7 @@ There are several options that are supported on the main app and subcommands and
 * `.parsed()`: True if this subcommand was given on the command line.
 * `.count()`: Returns the number of times the subcommand was called.
 * `.count(option_name)`: Returns the number of times a particular option was called.
-* `.count_all()`: Returns the total number of arguments a particular subcommand processed, on the master App it returns the total number of processed commands.
+* `.count_all()`: Returns the total number of arguments a particular subcommand processed, on the main App it returns the total number of processed commands.
 * `.name(name)`: Add or change the name.
 * `.callback(void() function)`: Set the callback for an app. Either sets the `pre_parse_callback` or the `final_callback` depending on the value of `immediate_callback`. See [Subcommand callbacks](#callbacks) for some additional details.
 * `.parse_complete_callback(void() function)`: Set the callback that runs at the completion of parsing. For subcommands this is executed at the completion of the single subcommand and can be executed multiple times. See [Subcommand callbacks](#callbacks) for some additional details.
@@ -673,7 +686,7 @@ The subcommand method
 .add_option_group(name,description)
 ```
 
-Will create an option group, and return a pointer to it. The argument for `description` is optional and can be omitted.  An option group allows creation of a collection of options, similar to the groups function on options, but with additional controls and requirements.  They allow specific sets of options to be composed and controlled as a collective.  For an example see [range example](https://github.com/CLIUtils/CLI11/blob/master/examples/ranges.cpp).  Option groups are a specialization of an App so all [functions](#subcommand-options) that work with an App or subcommand also work on option groups.  Options can be created as part of an option group using the add functions just like a subcommand, or previously created options can be added through.  The name given in an option group must not contain newlines or null characters.🆕
+Will create an option group, and return a pointer to it. The argument for `description` is optional and can be omitted.  An option group allows creation of a collection of options, similar to the groups function on options, but with additional controls and requirements.  They allow specific sets of options to be composed and controlled as a collective.  For an example see [range example](https://github.com/CLIUtils/CLI11/blob/main/examples/ranges.cpp).  Option groups are a specialization of an App so all [functions](#subcommand-options) that work with an App or subcommand also work on option groups.  Options can be created as part of an option group using the add functions just like a subcommand, or previously created options can be added through.  The name given in an option group must not contain newlines or null characters.🆕
 
 ```cpp
 ogroup->add_option(option_pointer);
@@ -892,28 +905,28 @@ The API is [documented here][api-docs]. Also see the [CLI11 tutorial GitBook][gi
 
 Several short examples of different features are included in the repository. A brief description of each is included here
 
-* [callback_passthrough](https://github.com/CLIUtils/CLI11/blob/master/examples/callback_passthrough.cpp): Example of directly passing remaining arguments through to a callback function which generates a CLI11 application based on existing arguments.
-* [custom_parse](https://github.com/CLIUtils/CLI11/blob/master/examples/custom_parse.cpp): Based on [Issue #566](https://github.com/CLIUtils/CLI11/issues/566), example of custom parser
-* [digit_args](https://github.com/CLIUtils/CLI11/blob/master/examples/digit_args.cpp): Based on [Issue #123](https://github.com/CLIUtils/CLI11/issues/123), uses digit flags to pass a value
-* [enum](https://github.com/CLIUtils/CLI11/blob/master/examples/enum.cpp): Using enumerations in an option, and the use of [CheckedTransformer](#transforming-validators)
-* [enum_ostream](https://github.com/CLIUtils/CLI11/blob/master/examples/enum_ostream.cpp): In addition to the contents of example enum.cpp, this example shows how a custom ostream operator overrides CLI11's enum streaming.
-* [formatter](https://github.com/CLIUtils/CLI11/blob/master/examples/formatter.cpp): Illustrating usage of a custom formatter
-* [groups](https://github.com/CLIUtils/CLI11/blob/master/examples/groups.cpp): Example using groups of options for help grouping and a the timer helper class
-* [inter_argument_order](https://github.com/CLIUtils/CLI11/blob/master/examples/inter_argument_order.cpp): An app to practice mixing unlimited arguments, but still recover the original order.
-* [json](https://github.com/CLIUtils/CLI11/blob/master/examples/json.cpp): Using JSON as a config file parser
-* [modhelp](https://github.com/CLIUtils/CLI11/blob/master/examples/modhelp.cpp): How to modify the help flag to do something other than default
-* [nested](https://github.com/CLIUtils/CLI11/blob/master/examples/nested.cpp): Nested subcommands
-* [option_groups](https://github.com/CLIUtils/CLI11/blob/master/examples/option_groups.cpp): Illustrating the use of option groups and a required number of options. Based on [Issue #88](https://github.com/CLIUtils/CLI11/issues/88) to set interacting groups of options
-* [positional_arity](https://github.com/CLIUtils/CLI11/blob/master/examples/positional_arity.cpp): Illustrating use of `preparse_callback` to handle situations where the number of arguments can determine which should get parsed,  Based on [Issue #166](https://github.com/CLIUtils/CLI11/issues/166)
-* [positional_validation](https://github.com/CLIUtils/CLI11/blob/master/examples/positional_validation.cpp): Example of how positional arguments are validated using the `validate_positional` flag, also based on [Issue #166](https://github.com/CLIUtils/CLI11/issues/166)
-* [prefix_command](https://github.com/CLIUtils/CLI11/blob/master/examples/prefix_command.cpp): Illustrating use of the `prefix_command` flag.
-* [ranges](https://github.com/CLIUtils/CLI11/blob/master/examples/ranges.cpp): App to demonstrate exclusionary option groups based on [Issue #88](https://github.com/CLIUtils/CLI11/issues/88)
-* [shapes](https://github.com/CLIUtils/CLI11/blob/master/examples/shapes.cpp): Illustrating how to set up repeated subcommands Based on [gitter discussion](https://gitter.im/CLI11gitter/Lobby?at=5c7af6b965ffa019ea788cd5)
-* [simple](https://github.com/CLIUtils/CLI11/blob/master/examples/simple.cpp): A simple example of how to set up a CLI11 Application with different flags and options
-* [subcom_help](https://github.com/CLIUtils/CLI11/blob/master/examples/subcom_help.cpp): Configuring help for subcommands
-* [subcom_partitioned](https://github.com/CLIUtils/CLI11/blob/master/examples/subcom_partitioned.cpp): Example with a timer and subcommands generated separately and added to the main app later.
-* [subcommands](https://github.com/CLIUtils/CLI11/blob/master/examples/subcommands.cpp): Short example of subcommands
-* [validators](https://github.com/CLIUtils/CLI11/blob/master/examples/validators.cpp): Example illustrating use of validators
+* [callback_passthrough](https://github.com/CLIUtils/CLI11/blob/main/examples/callback_passthrough.cpp): Example of directly passing remaining arguments through to a callback function which generates a CLI11 application based on existing arguments.
+* [custom_parse](https://github.com/CLIUtils/CLI11/blob/main/examples/custom_parse.cpp): Based on [Issue #566](https://github.com/CLIUtils/CLI11/issues/566), example of custom parser
+* [digit_args](https://github.com/CLIUtils/CLI11/blob/main/examples/digit_args.cpp): Based on [Issue #123](https://github.com/CLIUtils/CLI11/issues/123), uses digit flags to pass a value
+* [enum](https://github.com/CLIUtils/CLI11/blob/main/examples/enum.cpp): Using enumerations in an option, and the use of [CheckedTransformer](#transforming-validators)
+* [enum_ostream](https://github.com/CLIUtils/CLI11/blob/main/examples/enum_ostream.cpp): In addition to the contents of example enum.cpp, this example shows how a custom ostream operator overrides CLI11's enum streaming.
+* [formatter](https://github.com/CLIUtils/CLI11/blob/main/examples/formatter.cpp): Illustrating usage of a custom formatter
+* [groups](https://github.com/CLIUtils/CLI11/blob/main/examples/groups.cpp): Example using groups of options for help grouping and the timer helper class
+* [inter_argument_order](https://github.com/CLIUtils/CLI11/blob/main/examples/inter_argument_order.cpp): An app to practice mixing unlimited arguments, but still recover the original order.
+* [json](https://github.com/CLIUtils/CLI11/blob/main/examples/json.cpp): Using JSON as a config file parser
+* [modhelp](https://github.com/CLIUtils/CLI11/blob/main/examples/modhelp.cpp): How to modify the help flag to do something other than default
+* [nested](https://github.com/CLIUtils/CLI11/blob/main/examples/nested.cpp): Nested subcommands
+* [option_groups](https://github.com/CLIUtils/CLI11/blob/main/examples/option_groups.cpp): Illustrating the use of option groups and a required number of options. Based on [Issue #88](https://github.com/CLIUtils/CLI11/issues/88) to set interacting groups of options
+* [positional_arity](https://github.com/CLIUtils/CLI11/blob/main/examples/positional_arity.cpp): Illustrating use of `preparse_callback` to handle situations where the number of arguments can determine which should get parsed. Based on [Issue #166](https://github.com/CLIUtils/CLI11/issues/166)
+* [positional_validation](https://github.com/CLIUtils/CLI11/blob/main/examples/positional_validation.cpp): Example of how positional arguments are validated using the `validate_positional` flag, also based on [Issue #166](https://github.com/CLIUtils/CLI11/issues/166)
+* [prefix_command](https://github.com/CLIUtils/CLI11/blob/main/examples/prefix_command.cpp): Illustrating use of the `prefix_command` flag.
+* [ranges](https://github.com/CLIUtils/CLI11/blob/main/examples/ranges.cpp): App to demonstrate exclusionary option groups based on [Issue #88](https://github.com/CLIUtils/CLI11/issues/88)
+* [shapes](https://github.com/CLIUtils/CLI11/blob/main/examples/shapes.cpp): Illustrating how to set up repeated subcommands. Based on [gitter discussion](https://gitter.im/CLI11gitter/Lobby?at=5c7af6b965ffa019ea788cd5)
+* [simple](https://github.com/CLIUtils/CLI11/blob/main/examples/simple.cpp): A simple example of how to set up a CLI11 Application with different flags and options
+* [subcom_help](https://github.com/CLIUtils/CLI11/blob/main/examples/subcom_help.cpp): Configuring help for subcommands
+* [subcom_partitioned](https://github.com/CLIUtils/CLI11/blob/main/examples/subcom_partitioned.cpp): Example with a timer and subcommands generated separately and added to the main app later.
+* [subcommands](https://github.com/CLIUtils/CLI11/blob/main/examples/subcommands.cpp): Short example of subcommands
+* [validators](https://github.com/CLIUtils/CLI11/blob/main/examples/validators.cpp): Example illustrating use of validators
 
 ## Contribute
 
@@ -1008,15 +1021,15 @@ CLI11 was developed at the [University of Cincinnati][] to support of the [GooFi
 
 [doi-badge]: https://zenodo.org/badge/80064252.svg
 [doi-link]: https://zenodo.org/badge/latestdoi/80064252
-[azure-badge]: https://dev.azure.com/CLIUtils/CLI11/_apis/build/status/CLIUtils.CLI11?branchName=master
+[azure-badge]: https://dev.azure.com/CLIUtils/CLI11/_apis/build/status/CLIUtils.CLI11?branchName=main
 [azure]: https://dev.azure.com/CLIUtils/CLI11
-[travis-badge]: https://img.shields.io/travis/CLIUtils/CLI11/master.svg?label=Linux/macOS
-[travis]: https://travis-ci.org/CLIUtils/CLI11
-[appveyor-badge]: https://img.shields.io/appveyor/ci/HenrySchreiner/cli11/master.svg?label=AppVeyor
+[actions-link]: https://github.com/CLIUtils/CLI11/actions
+[actions-badge]: https://github.com/CLIUtils/CLI11/actions/workflows/tests.yml/badge.svg
+[appveyor-badge]: https://ci.appveyor.com/api/projects/status/82niaxpaa28dwbms/branch/main?svg=true
 [appveyor]: https://ci.appveyor.com/project/HenrySchreiner/cli11
-[actions-badge]: https://github.com/CLIUtils/CLI11/workflows/Tests/badge.svg
-[actions-link]:  https://github.com/CLIUtils/CLI11/actions
-[codecov-badge]: https://codecov.io/gh/CLIUtils/CLI11/branch/master/graph/badge.svg
+[repology-badge]: https://repology.org/badge/latest-versions/cli11.svg
+[repology]: https://repology.org/project/cli11/versions
+[codecov-badge]: https://codecov.io/gh/CLIUtils/CLI11/branch/main/graph/badge.svg?token=2O4wfs8NJO
 [codecov]: https://codecov.io/gh/CLIUtils/CLI11
 [gitter-badge]: https://badges.gitter.im/CLI11gitter/Lobby.svg
 [gitter]: https://gitter.im/CLI11gitter/Lobby
@@ -1056,8 +1069,8 @@ CLI11 was developed at the [University of Cincinnati][] to support of the [GooFi
 [version 1.3 post]: https://iscinumpy.gitlab.io/post/announcing-cli11-13/
 [version 1.6 post]: https://iscinumpy.gitlab.io/post/announcing-cli11-16/
 [version 2.0 post]: https://iscinumpy.gitlab.io/post/announcing-cli11-20/
-[wandbox-badge]: https://img.shields.io/badge/try_2.0-online-blue.svg
-[wandbox-link]: https://wandbox.org/permlink/650go2SXpfdvQ7ex
+[wandbox-badge]: https://img.shields.io/badge/try_2.1-online-blue.svg
+[wandbox-link]: https://wandbox.org/permlink/CA5bymNHh0AczdeN
 [releases-badge]: https://img.shields.io/github/release/CLIUtils/CLI11.svg
 [cli11-po-compare]: https://iscinumpy.gitlab.io/post/comparing-cli11-and-boostpo/
 [diana slides]: https://indico.cern.ch/event/619465/contributions/2507949/attachments/1448567/2232649/20170424-diana-2.pdf
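To complement the FetchContent snippet added to the README above, a minimal sketch of the installed-package route the same section describes (install prefix and project name are illustrative; per the README, pass `-DCLI11_DIR=/opt/CLI11` when configuring if it was installed to such a prefix):

```cmake
# Hypothetical consumer of an installed CLI11 package.
cmake_minimum_required(VERSION 3.4)
project(demo LANGUAGES CXX)

# Locates the CLI11 CMake config; honors CLI11_DIR / CMAKE_PREFIX_PATH hints.
find_package(CLI11 CONFIG REQUIRED)

add_executable(demo demo.cpp)
target_link_libraries(demo PRIVATE CLI11::CLI11)
```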
diff --git a/packages/CLI11/azure-pipelines.yml b/packages/CLI11/azure-pipelines.yml
index c72c748e6f59d6af651e28231b6cbb0d53f8a32d..750ac31d9aa57f93fc36992c675ff4e0d9d99b3f 100644
--- a/packages/CLI11/azure-pipelines.yml
+++ b/packages/CLI11/azure-pipelines.yml
@@ -4,11 +4,11 @@
 # https://docs.microsoft.com/azure/devops/pipelines/apps/c-cpp/gcc
 
 trigger:
-- master
+- main
 - 'v*'
 
 pr:
-- master
+- main
 - 'v*'
 
 variables:
diff --git a/packages/CLI11/book/README.md b/packages/CLI11/book/README.md
index 593adbc96dd168edefb419b4f470366d109640e1..917e753fe0876806d6d13ad6636e3ce18faaf885 100644
--- a/packages/CLI11/book/README.md
+++ b/packages/CLI11/book/README.md
@@ -28,7 +28,7 @@ Like any good command line application, help is provided. This program can be im
 
 [include](code/intro.cpp)
 
-[Source code](https://github.com/CLIUtils/CLI11/blob/master/book/code/intro.cpp)
+[Source code](https://github.com/CLIUtils/CLI11/blob/main/book/code/intro.cpp)
 
 Unlike some other libraries, this is enough to exit correctly and cleanly if help is requested or if incorrect arguments are passed. You can try this example out for yourself. To compile with GCC:
 
@@ -58,7 +58,7 @@ CLI11 was developed at the [University of Cincinnati][] in support of the [GooFi
 [cli11tutorial]: https://cliutils.github.io/CLI11/book
 [releases]: https://github.com/CLIUtils/CLI11/releases
 [api docs]: https://cliutils.github.io/CLI11
-[readme]: https://github.com/CLIUtils/CLI11/blob/master/README.md
+[readme]: https://github.com/CLIUtils/CLI11/blob/main/README.md
 [nsf 1414736]: https://nsf.gov/awardsearch/showAward?AWD_ID=1414736
 [university of cincinnati]: https://www.uc.edu
 [plumbum]: https://plumbum.readthedocs.io/en/latest/
diff --git a/packages/CLI11/book/chapters/an-advanced-example.md b/packages/CLI11/book/chapters/an-advanced-example.md
index 82fe7b71ce1dabe7ee0de41a3b04f7575ac618e3..84e838ebf32aa0b909dee1fe75e2316b60d305ad 100644
--- a/packages/CLI11/book/chapters/an-advanced-example.md
+++ b/packages/CLI11/book/chapters/an-advanced-example.md
@@ -16,7 +16,7 @@ All that's need now is the parse call. We'll print a little message after the co
 
 [include:"Parse"](../code/geet.cpp)
 
-[Source code](https://github.com/CLIUtils/CLI11/tree/master/book/code/geet.cpp)
+[Source code](https://github.com/CLIUtils/CLI11/tree/main/book/code/geet.cpp)
 
 If you compile and run:
 
diff --git a/packages/CLI11/book/chapters/config.md b/packages/CLI11/book/chapters/config.md
index df004ce75c17a849a92d897b5dc59b00f7470700..79295bdbfc283e8d7eaa3ff42bdd332dec5d9b57 100644
--- a/packages/CLI11/book/chapters/config.md
+++ b/packages/CLI11/book/chapters/config.md
@@ -195,7 +195,7 @@ Finally, set your new class as new config formatter:
 app.config_formatter(std::make_shared<NewConfig>());
 ```
 
-See [`examples/json.cpp`](https://github.com/CLIUtils/CLI11/blob/master/examples/json.cpp) for a complete JSON config example.
+See [`examples/json.cpp`](https://github.com/CLIUtils/CLI11/blob/main/examples/json.cpp) for a complete JSON config example.
 
 ### Trivial JSON configuration example
 
diff --git a/packages/CLI11/book/chapters/flags.md b/packages/CLI11/book/chapters/flags.md
index aa920fe8572c3e70079201a033db28b8add89aff..4e269acdf73dd1a87cce0f6e1a7c4bd8347aeefd 100644
--- a/packages/CLI11/book/chapters/flags.md
+++ b/packages/CLI11/book/chapters/flags.md
@@ -97,7 +97,7 @@ The values would be used like this:
 
 [include:"usage"](../code/flags.cpp)
 
-[Source code](https://github.com/CLIUtils/CLI11/tree/master/book/code/flags.cpp)
+[Source code](https://github.com/CLIUtils/CLI11/tree/main/book/code/flags.cpp)
 
 If you compile and run:
 
diff --git a/packages/CLI11/docs/Doxyfile b/packages/CLI11/docs/Doxyfile
index d08a2902e11ab318fce026f068dfb412780554f0..c86b0ba7283129aaa4916f896a34f44d07e0967c 100644
--- a/packages/CLI11/docs/Doxyfile
+++ b/packages/CLI11/docs/Doxyfile
@@ -1325,7 +1325,7 @@ CHM_FILE               =
 HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
diff --git a/packages/CLI11/include/CLI/App.hpp b/packages/CLI11/include/CLI/App.hpp
index 03b58d9fe7cba33953435483caa031630d6a66aa..803f0f7f62e384c63745d5904fb1501565e979b8 100644
--- a/packages/CLI11/include/CLI/App.hpp
+++ b/packages/CLI11/include/CLI/App.hpp
@@ -1739,10 +1739,10 @@ class App {
     /// Get a pointer to the version option. (const)
     const Option *get_version_ptr() const { return version_ptr_; }
 
-    /// Get the parent of this subcommand (or nullptr if master app)
+    /// Get the parent of this subcommand (or nullptr if main app)
     App *get_parent() { return parent_; }
 
-    /// Get the parent of this subcommand (or nullptr if master app) (const version)
+    /// Get the parent of this subcommand (or nullptr if main app) (const version)
     const App *get_parent() const { return parent_; }
 
     /// Get the name of the current app
@@ -2456,7 +2456,7 @@ class App {
     }
 
     /// Parse "one" argument (some may eat more than one), delegate to parent if fails, add to missing if missing
-    /// from master return false if the parse has failed and needs to return to parent
+    /// from main return false if the parse has failed and needs to return to parent
     bool _parse_single(std::vector<std::string> &args, bool &positional_only) {
         bool retval = true;
         detail::Classifier classifier = positional_only ? detail::Classifier::NONE : _recognize(args.back());
@@ -2731,7 +2731,7 @@ class App {
                     }
                 }
             }
-            // If a subcommand, try the master command
+            // If a subcommand, try the main command
             if(parent_ != nullptr && fallthrough_)
                 return _get_fallthrough_parent()->_parse_arg(args, current_type);
             // don't capture missing if this is a nameless subcommand
@@ -3169,25 +3169,25 @@ struct AppFriend {
 #ifdef CLI11_CPP14
 
     /// Wrap _parse_short, perfectly forward arguments and return
-    template <typename... Args> static decltype(auto) parse_arg(App *app, Args &&... args) {
+    template <typename... Args> static decltype(auto) parse_arg(App *app, Args &&...args) {
         return app->_parse_arg(std::forward<Args>(args)...);
     }
 
     /// Wrap _parse_subcommand, perfectly forward arguments and return
-    template <typename... Args> static decltype(auto) parse_subcommand(App *app, Args &&... args) {
+    template <typename... Args> static decltype(auto) parse_subcommand(App *app, Args &&...args) {
         return app->_parse_subcommand(std::forward<Args>(args)...);
     }
 #else
     /// Wrap _parse_short, perfectly forward arguments and return
     template <typename... Args>
-    static auto parse_arg(App *app, Args &&... args) ->
+    static auto parse_arg(App *app, Args &&...args) ->
         typename std::result_of<decltype (&App::_parse_arg)(App, Args...)>::type {
         return app->_parse_arg(std::forward<Args>(args)...);
     }
 
     /// Wrap _parse_subcommand, perfectly forward arguments and return
     template <typename... Args>
-    static auto parse_subcommand(App *app, Args &&... args) ->
+    static auto parse_subcommand(App *app, Args &&...args) ->
         typename std::result_of<decltype (&App::_parse_subcommand)(App, Args...)>::type {
         return app->_parse_subcommand(std::forward<Args>(args)...);
     }
diff --git a/packages/CLI11/include/CLI/Option.hpp b/packages/CLI11/include/CLI/Option.hpp
index 616cd120cb292d94f4e533e0186c70a483500521..25a676055fba34d7448e263df4e94be526de002e 100644
--- a/packages/CLI11/include/CLI/Option.hpp
+++ b/packages/CLI11/include/CLI/Option.hpp
@@ -781,7 +781,7 @@ class Option : public OptionBase<Option> {
     /// Use `get_name(true)` to get the positional name (replaces `get_pname`)
     std::string get_name(bool positional = false,  ///< Show the positional name
                          bool all_options = false  ///< Show every option
-                         ) const {
+    ) const {
         if(get_group().empty())
             return {};  // Hidden
 
diff --git a/packages/CLI11/include/CLI/TypeTools.hpp b/packages/CLI11/include/CLI/TypeTools.hpp
index 2b87ec60a84dbec5cc0ea6b8d05649e8c8f9d605..0fa2299758d56b8b471fd8c6bc563aa08113459c 100644
--- a/packages/CLI11/include/CLI/TypeTools.hpp
+++ b/packages/CLI11/include/CLI/TypeTools.hpp
@@ -36,7 +36,7 @@ constexpr enabler dummy = {};
 /// A copy of enable_if_t from C++14, compatible with C++11.
 ///
 /// We could check to see if C++14 is being used, but it does not hurt to redefine this
-/// (even Google does this: https://github.com/google/skia/blob/master/include/private/SkTLogic.h)
+/// (even Google does this: https://github.com/google/skia/blob/main/include/private/SkTLogic.h)
 /// It is not in the std namespace anyway, so no harm done.
 template <bool B, class T = void> using enable_if_t = typename std::enable_if<B, T>::type;
 
diff --git a/packages/CLI11/include/CLI/Validators.hpp b/packages/CLI11/include/CLI/Validators.hpp
index 3c8b2f42036a39ed8e1bd7b3bc2e631555072250..03eb77b6ffbe290ad130daedd7227892ebeec356 100644
--- a/packages/CLI11/include/CLI/Validators.hpp
+++ b/packages/CLI11/include/CLI/Validators.hpp
@@ -676,7 +676,7 @@ class IsMember : public Validator {
 
     /// This allows in-place construction using an initializer list
     template <typename T, typename... Args>
-    IsMember(std::initializer_list<T> values, Args &&... args)
+    IsMember(std::initializer_list<T> values, Args &&...args)
         : IsMember(std::vector<T>(values), std::forward<Args>(args)...) {}
 
     /// This checks to see if an item is in a set (empty function)
@@ -728,7 +728,7 @@ class IsMember : public Validator {
 
     /// You can pass in as many filter functions as you like, they nest (string only currently)
     template <typename T, typename... Args>
-    IsMember(T &&set, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&... other)
+    IsMember(T &&set, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&...other)
         : IsMember(
               std::forward<T>(set),
               [filter_fn_1, filter_fn_2](std::string a) { return filter_fn_2(filter_fn_1(a)); },
@@ -745,7 +745,7 @@ class Transformer : public Validator {
 
     /// This allows in-place construction
     template <typename... Args>
-    Transformer(std::initializer_list<std::pair<std::string, std::string>> values, Args &&... args)
+    Transformer(std::initializer_list<std::pair<std::string, std::string>> values, Args &&...args)
         : Transformer(TransformPairs<std::string>(values), std::forward<Args>(args)...) {}
 
     /// direct map of std::string to std::string
@@ -789,7 +789,7 @@ class Transformer : public Validator {
 
     /// You can pass in as many filter functions as you like, they nest
     template <typename T, typename... Args>
-    Transformer(T &&mapping, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&... other)
+    Transformer(T &&mapping, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&...other)
         : Transformer(
               std::forward<T>(mapping),
               [filter_fn_1, filter_fn_2](std::string a) { return filter_fn_2(filter_fn_1(a)); },
@@ -803,7 +803,7 @@ class CheckedTransformer : public Validator {
 
     /// This allows in-place construction
     template <typename... Args>
-    CheckedTransformer(std::initializer_list<std::pair<std::string, std::string>> values, Args &&... args)
+    CheckedTransformer(std::initializer_list<std::pair<std::string, std::string>> values, Args &&...args)
         : CheckedTransformer(TransformPairs<std::string>(values), std::forward<Args>(args)...) {}
 
     /// direct map of std::string to std::string
@@ -865,7 +865,7 @@ class CheckedTransformer : public Validator {
 
     /// You can pass in as many filter functions as you like, they nest
     template <typename T, typename... Args>
-    CheckedTransformer(T &&mapping, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&... other)
+    CheckedTransformer(T &&mapping, filter_fn_t filter_fn_1, filter_fn_t filter_fn_2, Args &&...other)
         : CheckedTransformer(
               std::forward<T>(mapping),
               [filter_fn_1, filter_fn_2](std::string a) { return filter_fn_2(filter_fn_1(a)); },
diff --git a/packages/CLI11/include/CLI/Version.hpp b/packages/CLI11/include/CLI/Version.hpp
index c989ae86d4cc559f0491ce4a88c71e26d87fbf7b..4bc79a7737a558b264fac0f460c4dfcd324b7075 100644
--- a/packages/CLI11/include/CLI/Version.hpp
+++ b/packages/CLI11/include/CLI/Version.hpp
@@ -10,7 +10,7 @@
 
 #define CLI11_VERSION_MAJOR 2
 #define CLI11_VERSION_MINOR 1
-#define CLI11_VERSION_PATCH 1
-#define CLI11_VERSION "2.1.1"
+#define CLI11_VERSION_PATCH 2
+#define CLI11_VERSION "2.1.2"
 
 // [CLI11:version_hpp:end]
diff --git a/packages/CLI11/tests/CMakeLists.txt b/packages/CLI11/tests/CMakeLists.txt
index c322615f638715b988d17ea0addf102595b4b890..80c4f6a83ec93df312aaf0c19279141b02f9055e 100644
--- a/packages/CLI11/tests/CMakeLists.txt
+++ b/packages/CLI11/tests/CMakeLists.txt
@@ -63,23 +63,33 @@ endif()
 
 set(CLI11_MULTIONLY_TESTS TimerTest)
 
-add_library(catch_main main.cpp)
+add_library(catch_main main.cpp catch.hpp)
 target_include_directories(catch_main PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
 
-# Currently a required download; could be make to look for existing Catch2, but
-# that would require changing the includes. FetchContent would be better, but
-# requires newer CMake.
-
-set(url https://github.com/philsquared/Catch/releases/download/v2.13.6/catch.hpp)
-file(
-  DOWNLOAD ${url} "${CMAKE_CURRENT_BINARY_DIR}/catch.hpp"
-  STATUS status
-  EXPECTED_HASH SHA256=681e7505a50887c9085539e5135794fc8f66d8e5de28eadf13a30978627b0f47)
-list(GET status 0 error)
-if(error)
-  message(FATAL_ERROR "Could not download ${url}")
+find_package(Catch2 CONFIG)
+
+if(Catch2_FOUND)
+  if(NOT TARGET Catch2::Catch2)
+    message(FATAL_ERROR "Found Catch2 at ${Catch2_DIR} but targets are missing.")
+  endif()
+  message(STATUS "Found Catch2")
+  target_link_libraries(catch_main PUBLIC Catch2::Catch2)
+else()
+  message(STATUS "Downloading Catch2")
+
+  # FetchContent would be better, but requires newer CMake.
+  file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/catch2")
+  set(url https://github.com/philsquared/Catch/releases/download/v2.13.7/catch.hpp)
+  file(
+    DOWNLOAD ${url} "${CMAKE_CURRENT_BINARY_DIR}/catch2/catch.hpp"
+    STATUS status
+    EXPECTED_HASH SHA256=ea379c4a3cb5799027b1eb451163dff065a3d641aaba23bf4e24ee6b536bd9bc)
+  list(GET status 0 error)
+  if(error)
+    message(FATAL_ERROR "Could not download ${url}, and Catch2 not found on your system.")
+  endif()
+  target_include_directories(catch_main PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
 endif()
-target_include_directories(catch_main PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
 
 # Target must already exist
 macro(add_catch_test TESTNAME)
@@ -174,8 +184,6 @@ file(WRITE "${PROJECT_BINARY_DIR}/CTestCustom.cmake"
 target_compile_definitions(informational PRIVATE ${boost-optional-def})
 target_compile_definitions(OptionalTest PRIVATE ${boost-optional-def})
 
-message(STATUS "Boost libs=${Boost_INCLUDE_DIRS}")
-
 if(TARGET Boost::boost)
   message(STATUS "including boost target")
   target_link_libraries(informational PRIVATE Boost::boost)
@@ -185,6 +193,7 @@ if(TARGET Boost::boost)
     target_link_libraries(OptionalTest_Single PRIVATE Boost::boost)
     target_link_libraries(BoostOptionTypeTest_Single PRIVATE Boost::boost)
   endif()
+  message(STATUS "Boost libs=${Boost_INCLUDE_DIRS}")
 elseif(BOOST_FOUND)
   message(STATUS "no boost target")
   target_include_directories(informational PRIVATE ${Boost_INCLUDE_DIRS})
@@ -194,6 +203,9 @@ elseif(BOOST_FOUND)
     target_include_directories(OptionalTest_Single PRIVATE ${Boost_INCLUDE_DIRS})
     target_include_directories(BoostOptionTypeTest_Single PRIVATE ${Boost_INCLUDE_DIRS})
   endif()
+  message(STATUS "Boost libs=${Boost_INCLUDE_DIRS}")
+else()
+  message(STATUS "Boost not found, not adding boost tests")
 endif()
 
 if(CMAKE_BUILD_TYPE STREQUAL Coverage)
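The test setup above now prefers an existing Catch2 config package and only falls back to downloading the single header (v2.13.7); the new `tests/catch.hpp` shim further below forwards to `<catch2/catch.hpp>` so both paths share one include. For someone building the tests from a CLI11 checkout, the lookup can be steered with the usual config-package hints; a sketch with an illustrative install prefix, for example in an initial-cache file passed via `cmake -C`:

```cmake
# Illustrative initial cache: point find_package(Catch2 CONFIG) at an
# installed Catch2 instead of letting the build download catch.hpp.
set(Catch2_DIR "/opt/catch2/lib/cmake/Catch2" CACHE PATH "Catch2 config location")
# A prefix hint works as well:
# set(CMAKE_PREFIX_PATH "/opt/catch2" CACHE STRING "Search prefix for packages")
```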
diff --git a/packages/CLI11/tests/OptionalTest.cpp b/packages/CLI11/tests/OptionalTest.cpp
index 6b07f01c703e7f26d46d5cabfc2520e6fdb0364e..5ea20be9dbd8beab8ebfed186ac88434a05da55a 100644
--- a/packages/CLI11/tests/OptionalTest.cpp
+++ b/packages/CLI11/tests/OptionalTest.cpp
@@ -258,11 +258,13 @@ TEST_CASE_METHOD(TApp, "BoostOptionalVector", "[optional]") {
            "-v,--vec", [&opt](const std::vector<int> &v) { opt = v; }, "some vector")
         ->expected(3);
     run();
-    CHECK(!opt);
+    bool checkOpt = static_cast<bool>(opt);
+    CHECK(!checkOpt);
 
     args = {"-v", "1", "4", "5"};
     run();
-    CHECK(opt);
+    checkOpt = static_cast<bool>(opt);
+    CHECK(checkOpt);
     std::vector<int> expV{1, 4, 5};
     CHECK(expV == *opt);
 }
@@ -272,14 +274,17 @@ TEST_CASE_METHOD(TApp, "BoostOptionalVectorEmpty", "[optional]") {
     app.add_option<decltype(opt), std::vector<int>>("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     // app.add_option("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     run();
-    CHECK(!opt);
+    bool checkOpt = static_cast<bool>(opt);
+    CHECK(!checkOpt);
     args = {"-v"};
     opt = std::vector<int>{4, 3};
     run();
-    CHECK(!opt);
+    checkOpt = static_cast<bool>(opt);
+    CHECK(!checkOpt);
     args = {"-v", "1", "4", "5"};
     run();
-    CHECK(opt);
+    checkOpt = static_cast<bool>(opt);
+    CHECK(checkOpt);
     std::vector<int> expV{1, 4, 5};
     CHECK(expV == *opt);
 }
@@ -289,14 +294,17 @@ TEST_CASE_METHOD(TApp, "BoostOptionalVectorEmptyDirect", "[optional]") {
     app.add_option_no_stream("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     // app.add_option("-v,--vec", opt)->expected(0, 3)->allow_extra_args();
     run();
-    CHECK(!opt);
+    bool checkOpt = static_cast<bool>(opt);
+    CHECK(!checkOpt);
     args = {"-v"};
     opt = std::vector<int>{4, 3};
     run();
-    CHECK(!opt);
+    checkOpt = static_cast<bool>(opt);
+    CHECK(!checkOpt);
     args = {"-v", "1", "4", "5"};
     run();
-    CHECK(opt);
+    checkOpt = static_cast<bool>(opt);
+    CHECK(checkOpt);
     std::vector<int> expV{1, 4, 5};
     CHECK(expV == *opt);
 }
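
The repeated `static_cast<bool>(opt)` above converts the `boost::optional` to a plain `bool` before it reaches `CHECK`, so Catch2's assertion macro only ever decomposes and stringifies a `bool` rather than the optional itself. A minimal standalone sketch of the same idiom (hypothetical test, not part of the CLI11 suite):

```cpp
#include <vector>

#include <boost/optional.hpp>

#include "catch.hpp"

TEST_CASE("check boost::optional through an explicit bool") {
    boost::optional<std::vector<int>> opt;
    bool checkOpt = static_cast<bool>(opt);  // empty -> false
    CHECK(!checkOpt);

    opt = std::vector<int>{1, 4, 5};
    checkOpt = static_cast<bool>(opt);  // engaged -> true
    CHECK(checkOpt);

    std::vector<int> expV{1, 4, 5};
    CHECK(expV == *opt);
}
```
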
diff --git a/packages/CLI11/tests/catch.hpp b/packages/CLI11/tests/catch.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2aaeae76e3c58c517e0212ef0a31d0fe79db7dea
--- /dev/null
+++ b/packages/CLI11/tests/catch.hpp
@@ -0,0 +1,9 @@
+// Copyright (c) 2017-2021, University of Cincinnati, developed by Henry Schreiner
+// under NSF AWARD 1414736 and by the respective contributors.
+// All rights reserved.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <catch2/catch.hpp>
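
The new `tests/catch.hpp` is a thin shim: test sources keep writing `#include "catch.hpp"`, and the shim forwards to `<catch2/catch.hpp>`, which is satisfied either by the include directories of the `Catch2::Catch2` target (system install) or by the single header the build downloads into `${CMAKE_CURRENT_BINARY_DIR}/catch2/`. A hedged sketch of a translation unit seen through the shim; the `CATCH_CONFIG_MAIN` define is only for this standalone example, since the real suite presumably gets `main()` from the `catch_main` library built above:

```cpp
// Standalone illustration only.
#define CATCH_CONFIG_MAIN
#include "catch.hpp"  // the shim above, which in turn includes <catch2/catch.hpp>

TEST_CASE("the Catch2 shim resolves to a working Catch2") {
    CHECK(1 + 1 == 2);
}
```
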
diff --git a/packages/PEGTL/.clang-format b/packages/PEGTL/.clang-format
index c423ca2749e808e112ba69db6d6f33ee76ca955f..a3f254129f2326865c1ca172033c1b99f63f3a11 100644
--- a/packages/PEGTL/.clang-format
+++ b/packages/PEGTL/.clang-format
@@ -1,6 +1,9 @@
-# the official .clang-format style for https://github.com/taocpp
-#
-# clang-format -i -style=file $(find . -name '[^.]*.[hc]pp')
+# The Art of C++
+# https://github.com/PEGTL
+
+# Copyright (c) 2016-2021 Daniel Frey
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 Language: Cpp
 Standard: Latest
diff --git a/packages/PEGTL/.clang-tidy b/packages/PEGTL/.clang-tidy
index cad3a9f3fc344995216d51ad6ae3df9821f03a0f..b98e0d8b54747e3497f2e0376815d17fa55c98ac 100644
--- a/packages/PEGTL/.clang-tidy
+++ b/packages/PEGTL/.clang-tidy
@@ -1,3 +1,10 @@
+# The Art of C++
+# https://github.com/PEGTL
+
+# Copyright (c) 2016-2021 Daniel Frey
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
+
 Checks: >-
   bugprone-*,
   cppcoreguidelines-slicing,
diff --git a/packages/PEGTL/.gitrepo b/packages/PEGTL/.gitrepo
index 873dce466a5e68dd515bc2ca381ad4262367b9c2..279480bff62486421386892258910b04da453b3a 100644
--- a/packages/PEGTL/.gitrepo
+++ b/packages/PEGTL/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:taocpp/PEGTL.git
 	branch = main
-	commit = fbfebc50d0b84a1fcd40e083a6e630fe815e21a7
-	parent = 8bade9551beef13a194e250386ff539c38cef805
+	commit = bf4487c9793121e483291c4e516cec3e5c1c17b5
+	parent = e00b72ebcd9d2add12cfe0e6fe4d114a7858dfa5
 	method = merge
 	cmdver = 0.4.3
diff --git a/packages/PEGTL/CMakeLists.txt b/packages/PEGTL/CMakeLists.txt
index c2ee336f31b4c18f309adfa94c77db6f99978936..2233099fa6181105a7e6053740984913bc856534 100644
--- a/packages/PEGTL/CMakeLists.txt
+++ b/packages/PEGTL/CMakeLists.txt
@@ -147,7 +147,7 @@ install(EXPORT pegtl-targets
 
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pegtl-config-version.cmake DESTINATION ${PEGTL_INSTALL_CMAKE_DIR})
 install(DIRECTORY include/ DESTINATION ${PEGTL_INSTALL_INCLUDE_DIR})
-install(FILES LICENSE DESTINATION ${PEGTL_INSTALL_DOC_DIR})
+install(FILES LICENSE_1_0.txt DESTINATION ${PEGTL_INSTALL_DOC_DIR})
 
 export(EXPORT pegtl-targets
   FILE ${pegtl_BINARY_DIR}/pegtl-targets.cmake
diff --git a/packages/PEGTL/LICENSE b/packages/PEGTL/LICENSE
deleted file mode 100644
index c981590829a1ee30e859ac04a31844398be0ae8e..0000000000000000000000000000000000000000
--- a/packages/PEGTL/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2007-2021 Dr. Colin Hirsch and Daniel Frey
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/packages/PEGTL/LICENSE_1_0.txt b/packages/PEGTL/LICENSE_1_0.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36b7cd93cdfbac762f5be4c6ce276df2ea6305c2
--- /dev/null
+++ b/packages/PEGTL/LICENSE_1_0.txt
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/packages/PEGTL/Makefile b/packages/PEGTL/Makefile
index 3ecca070e2264b4af4542b275f0872bf2c2a32a8..773c525bbc936e079e33e33084a45ed33d5e7469 100644
--- a/packages/PEGTL/Makefile
+++ b/packages/PEGTL/Makefile
@@ -1,6 +1,9 @@
 # The Art of C++
+# https://github.com/PEGTL
+
 # Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-# Please see LICENSE for license or visit https://github.com/taocpp/PEGTL
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 .SUFFIXES:
 .SECONDARY:
@@ -61,31 +64,6 @@ build/%.d: %.cpp Makefile
 build/%: %.cpp build/%.d
 	$(CXX) $(CXXSTD) -Iinclude $(CPPFLAGS) $(CXXFLAGS) $< $(LDFLAGS) -o $@
 
-.PHONY: amalgamate
-amalgamate: build/amalgamated/pegtl.hpp
-
-build/amalgamated/pegtl.hpp: $(HEADERS)
-	@mkdir -p $(@D)
-	@rm -rf build/include
-	@cp -a include build/
-	@rm -rf build/include/tao/pegtl/contrib/icu
-	@sed -i -e 's%^#%//#%g' $$(find build/include -name '*.hpp')
-	@sed -i -e 's%^//#include "%#include "%g' $$(find build/include -name '*.hpp')
-	@for i in $$(find build/include -name '*.hpp'); do echo "#pragma once" >tmp.out; echo "#line 1" >>tmp.out; cat $$i >>tmp.out; mv tmp.out $$i; done
-	@echo '#include "tao/pegtl.hpp"' >build/include/amalgamated.hpp
-	@( cd build/include ; for i in tao/pegtl/contrib/*.hpp; do echo "#include \"$$i\""; done ) >>build/include/amalgamated.hpp
-	@echo -e "/*\n\nWelcome to the Parsing Expression Grammar Template Library (PEGTL)." >$@
-	@echo -e "See https://github.com/taocpp/PEGTL/ for more information, documentation, etc.\n" >>$@
-	@echo -e "The library is licensed as follows:\n" >>$@
-	@cat LICENSE >>$@
-	@echo -e "\n*/\n" >>$@
-	@( cd build/include ; g++ -E -C -nostdinc amalgamated.hpp ) >>$@
-	@sed -i -e 's%^//#%#%g' $@
-	@sed -i -e 's%^# \([0-9]* "[^"]*"\).*%#line \1%g' $@
-	@sed -i -e 's%^// Copyright.*%%g' $@
-	@sed -i -e 's%^// Please.*%%g' $@
-	@echo "Generated/updated $@ successfully."
-
 ifeq ($(findstring $(MAKECMDGOALS),clean),)
 -include $(DEPENDS)
 endif
diff --git a/packages/PEGTL/README.md b/packages/PEGTL/README.md
index 0ee9ddfe6387735875a9383d5d91235ef1105d4a..399b2bb3a7c3e74c628a5df6a11fb7731f30e250 100644
--- a/packages/PEGTL/README.md
+++ b/packages/PEGTL/README.md
@@ -136,16 +136,17 @@ The PEGTL is part of [The Art of C++](https://taocpp.github.io/).
 
 ## License
 
-<a href="https://opensource.org/licenses/MIT"><img align="right" src="https://opensource.org/files/OSIApproved.png" width="150" hspace="20" alt="Open Source Initiative"></a>
+<a href="https://opensource.org/licenses/BSL-1.0"><img align="right" src="https://opensource.org/files/OSIApproved.png" width="150" hspace="20" alt="Open Source Initiative"></a>
+
+Copyright (c) 2007-2021 Daniel Frey and Dr. Colin Hirsch
 
 The PEGTL is certified [Open Source](http://www.opensource.org/docs/definition.html) software.
-It may be used for any purpose, including commercial purposes, at absolutely no cost.
-It is distributed under the terms of the [MIT license](http://www.opensource.org/licenses/mit-license.html) reproduced here.
+It is [licensed](https://pdimov.github.io/blog/2020/09/06/why-use-the-boost-license/) under the terms of the [Boost Software License, Version 1.0](https://www.boost.org/LICENSE_1_0.txt) reproduced here.
 
-> Copyright (c) 2007-2021 Dr. Colin Hirsch and Daniel Frey
+> Boost Software License - Version 1.0 - August 17th, 2003
 >
-> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+> Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following:
 >
-> The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+> The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor.
 >
-> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/packages/PEGTL/doc/Actions-and-States.md b/packages/PEGTL/doc/Actions-and-States.md
index 4c8101682972bde687cad06b2a1d96024e7afd79..272b8ac460ccd0c15bd2225142a42542e90d0865 100644
--- a/packages/PEGTL/doc/Actions-and-States.md
+++ b/packages/PEGTL/doc/Actions-and-States.md
@@ -383,6 +383,7 @@ The [`state` rule](Rule-Reference.md#state-s-r-) behaves similarly to [`seq`](Ru
 This new object replaces the current state(s) for the remainder of the implicit [`seq`](Rule-Reference.md#seq-r-).
 
 The new object is constructed with a const-reference to the current input of the parsing run, and all previous states, if any, as arguments.
+If no such constructor exists, the new object is default constructed.
 If the implicit [`seq`](Rule-Reference.md#seq-r-) of the sub-rules succeeds, then, by default, a member function named `success()` is called on this "new" object, receiving the same arguments as the constructor.
 At this point the input will be advanced by whatever the sub-rules have consumed in the meantime.
 
@@ -399,7 +400,7 @@ The differences are summarised in this table; note that `change_state` is more s
 | Feature | `change_state` | `change_states` |
 | --- | --- | --- |
 | Number of new states | one | any |
-| Construction of new states | with input and old states | default |
+| Construction of new states | optionally with input and old states | default |
 | Success function on action | if not on new state | required |
 
 With `change_state` only a single new state type can be given as template parameter, and only a single new state will be created.
@@ -603,4 +604,10 @@ Note that deriving from `require_apply` or `require_apply0` is optional and usua
 
 See the [section on legacy-style action rules](Rule-Reference.md#action-rules).
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
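
The newly documented fallback ("If no such constructor exists, the new object is default constructed", also listed under 3.2.2 in the Changelog below) means a state type no longer has to provide the input-and-previous-states constructor. A hedged sketch of both shapes, assuming they are used with `change_state< S >` as described in this document; the names are illustrative:

```cpp
// State that uses the documented input + previous-states constructor.
struct with_ctor
{
   template< typename ParseInput, typename... States >
   explicit with_ctor( const ParseInput& /*in*/, States&&... /*st*/ ) {}

   template< typename ParseInput, typename... States >
   void success( const ParseInput& /*in*/, States&&... /*st*/ ) {}
};

// State without such a constructor; with the change above it is default constructed.
struct without_ctor
{
   without_ctor() = default;

   template< typename ParseInput, typename... States >
   void success( const ParseInput& /*in*/, States&&... /*st*/ ) {}
};
```
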
diff --git a/packages/PEGTL/doc/Changelog.md b/packages/PEGTL/doc/Changelog.md
index eed13ec4287a2ed12d38e17369fd9b7b70677d3d..aca8f740a1f549f4f21702bfdd76862fdff29dc8 100644
--- a/packages/PEGTL/doc/Changelog.md
+++ b/packages/PEGTL/doc/Changelog.md
@@ -1,5 +1,25 @@
 # Changelog
 
+## 3.3.0
+
+**Not yet released**
+
+* Switched to Boost Software License, Version 1.0.
+* Removed support for building an amalgamated header.
+* Deprecated the `TAO_PEGTL_NAMESPACE` macro.
+  * Will be removed in version 4.0.0.
+
+## 3.2.2
+
+Released 2021-10-22
+
+* Added rule [`odigit`](Rule-Reference.md#odigit) for octal digits.
+* Enabled default-constructed state in `state<>`, `change_state<>`, and `change_action_and_state<>`.
+* Changed rules in [`tao/pegtl/contrib/integer.hpp`](Contrib-and-Examples.md#taopegtlcontribintegerhpp) to not throw by default.
+* Added [`tao/pegtl/contrib/separated_seq.hpp`](Contrib-and-Examples.md#taopegtlcontribseparated_seqhpp).
+* Added `tao/pegtl/contrib/iri.hpp` grammar for IRIs.
+* Added `tao/pegtl/contrib/proto3.hpp` grammar for protocol buffer v3.
+
 ## 3.2.1
 
 Released 2021-07-31
@@ -627,4 +647,10 @@ Released 2008
 Development of the PEGTL started in November 2007 as an experiment in C++0x.
 It is based on ideas from the YARD library by Christopher Diggins.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2007-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Contrib-and-Examples.md b/packages/PEGTL/doc/Contrib-and-Examples.md
index f5b658e7595a33d576feab317ab45dc2f414879a..31f9035c161de1d791d8dc2b42eaf5662b11fcc1 100644
--- a/packages/PEGTL/doc/Contrib-and-Examples.md
+++ b/packages/PEGTL/doc/Contrib-and-Examples.md
@@ -70,6 +70,11 @@ For all questions and remarks contact us at **taocpp(at)icemx.net**.
 * Contains optimised version of `rep_min_max< Min, Max, ascii::one< C > >`:
 * Rule `ascii::rep_one_min_max< Min, Max, C >`.
 
+###### `<tao/pegtl/contrib/separated_seq.hpp>`
+
+* Allows parsing a sequence of rules separated by a separator.
+* Rule `separated_seq< S, A, B, C, D >` is equivalent to `seq< A, S, B, S, C, S, D >`.
+
 ###### `<tao/pegtl/contrib/to_string.hpp>`
 
 Utility function `to_string<>()` that converts template classes with arbitrary sequences of characters as template arguments into a `std::string` that contains these characters.
@@ -207,4 +212,10 @@ Uses the building blocks from `<tao/pegtl/contrib/unescape.hpp>` to show how to
 Shows how to use the included [tracer control](#taopegtlcontribtracerhpp), here together with the URI grammar from `<tao/pegtl/contrib/uri.hpp>`.
 Invoked with one or more URIs as command line arguments will attempt to parse the URIs while printing trace information to `std::cerr`.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
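
For the new `separated_seq` header, a short sketch of the documented equivalence; the rule names are illustrative, and `separated_seq` is assumed to live directly in `TAO_PEGTL_NAMESPACE` like most contrib rules:

```cpp
#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/separated_seq.hpp>

namespace pegtl = tao::pegtl;

using comma = pegtl::one< ',' >;

// separated_seq< S, A, B, C > is documented as equivalent to seq< A, S, B, S, C >:
using triple = pegtl::separated_seq< comma, pegtl::digit, pegtl::digit, pegtl::digit >;
using triple_expanded = pegtl::seq< pegtl::digit, comma, pegtl::digit, comma, pegtl::digit >;
```
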
diff --git a/packages/PEGTL/doc/Control-and-Debug.md b/packages/PEGTL/doc/Control-and-Debug.md
index 3ff6a88787a36b34de2397690b6cfca147edd9f0..10fa8f80a931247c2a4be54e7bacbe954aaa1473 100644
--- a/packages/PEGTL/doc/Control-and-Debug.md
+++ b/packages/PEGTL/doc/Control-and-Debug.md
@@ -144,4 +144,10 @@ Just like the action class template, a custom control class template can be used
 The latter requires the use of a [custom action](Actions-and-States.md).
 Deriving the specialisation of the custom action for `my_rule` from `tao::pegtl::change_control< my_control >` will switch the current control to `my_control` before attempting to match `my_rule`.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Errors-and-Exceptions.md b/packages/PEGTL/doc/Errors-and-Exceptions.md
index 7b8f018d84e757c919c2eae8f91864ccac87a060..a0b16d24c7bfc74228c504c38624de3cd5df7468 100644
--- a/packages/PEGTL/doc/Errors-and-Exceptions.md
+++ b/packages/PEGTL/doc/Errors-and-Exceptions.md
@@ -214,4 +214,10 @@ struct error
 It is advisable to choose the error points in the grammar with prudence.
 If this choice becomes particularly cumbersome and/or results in a large number of error points, this might be an indication that the grammar needs some kind of simplification or restructuring.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Getting-Started.md b/packages/PEGTL/doc/Getting-Started.md
index 1deceba9f37ab81a91448682e0f39dcdbe1d8943..fbf71cbd304777526e661e0f50e0c2fb1b3eaf38 100644
--- a/packages/PEGTL/doc/Getting-Started.md
+++ b/packages/PEGTL/doc/Getting-Started.md
@@ -283,4 +283,10 @@ Typically, the following pattern helps to print the exceptions in a human friend
 
 For more information see [Errors and Exceptions](Errors-and-Exceptions.md).
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Grammar-Analysis.md b/packages/PEGTL/doc/Grammar-Analysis.md
index 629982d8ec624f154c86fb95e16edacec6704a5a..94bb40aef184878e3a5d7f1c17f520bdefbc1f8c 100644
--- a/packages/PEGTL/doc/Grammar-Analysis.md
+++ b/packages/PEGTL/doc/Grammar-Analysis.md
@@ -124,4 +124,10 @@ In practice it appears to catch all cases of left-recursion that are typical for
 
 False positives are a theoretical problem in that, while relatively easy to trigger, they are not usually encountered when dealing with real world grammars.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Inputs-and-Parsing.md b/packages/PEGTL/doc/Inputs-and-Parsing.md
index b2bb15f44672010dd71792fbc9e4ee06e0c33d00..96948728c3e62278d94feedb4083a9f764db382c 100644
--- a/packages/PEGTL/doc/Inputs-and-Parsing.md
+++ b/packages/PEGTL/doc/Inputs-and-Parsing.md
@@ -554,4 +554,10 @@ Trying to call any of those functions on `buffer_input<>`-based instances will l
 
 All input classes support [deduction guides](https://en.cppreference.com/w/cpp/language/class_template_argument_deduction), e.g. instead of `file_input<> in( "filename.txt" )` one can use `file_input in( "filename.txt" )`.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Installing-and-Using.md b/packages/PEGTL/doc/Installing-and-Using.md
index f2e56051dafad0960f1a2e82f451e1586c344cbb..007987116287d8674c22b571c27239ca6e76a0a1 100644
--- a/packages/PEGTL/doc/Installing-and-Using.md
+++ b/packages/PEGTL/doc/Installing-and-Using.md
@@ -19,7 +19,6 @@
   * [Embedding in Binaries](#embedding-in-binaries)
   * [Embedding in Libraries](#embedding-in-libraries)
   * [Embedding in Library Interfaces](#embedding-in-library-interfaces)
-* [Single Header Version](#single-header-version)
 
 ## Requirements
 
@@ -98,8 +97,6 @@ Note that some of the listed packages are not updated regularly.
 
 ## Using Vcpkg
 
-[![Vcpkg package](https://repology.org/badge/version-for-repo/vcpkg/pegtl.svg)](https://repology.org/project/pegtl/versions)
-
 You can download and install the PEGTL using the [Vcpkg] package manager:
 
 ```bash
@@ -301,20 +298,13 @@ above; alternatively `include/tao/pegtl/config.hpp` can be directly modified.
 A practical example of what the result looks like can be found in our own
 header-only JSON library [taoJSON](https://github.com/taocpp/json/).
 
-## Single Header Version
-
-You can generate a single-header-version of the PEGTL with the included `Makefile`.
-In a Unix-shell, the following command will achieve this:
-
-```sh
-$ make amalgamate
-```
+---
 
-The above will generate a `build/amalgamated/pegtl.hpp` which will consist of
-the headers `tao/pegtl.hpp`, their dependencies, and all headers in
-`tao/pegtl/contrib/` except for the headers in `tao/pegtl/contrib/icu/`.
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
 
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
 
 [Vcpkg]: https://github.com/Microsoft/vcpkg
 [Vcpkg documentation]: https://github.com/Microsoft/vcpkg/tree/master/docs/index.md
diff --git a/packages/PEGTL/doc/Meta-Data-and-Visit.md b/packages/PEGTL/doc/Meta-Data-and-Visit.md
index 74f8672ca0d25e8d571b9dab9ff746b0c6034371..bed412efa030a9d8638dbee233b651a75276f7b9 100644
--- a/packages/PEGTL/doc/Meta-Data-and-Visit.md
+++ b/packages/PEGTL/doc/Meta-Data-and-Visit.md
@@ -147,4 +147,10 @@ struct coverage_result
 
 As usual, unless otherwise indicated, all functions and data structures are in the namespace `tao::pegtl`.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Migration-Guide.md b/packages/PEGTL/doc/Migration-Guide.md
index 91171f3b46050727eb42df083621ef46dc891c4c..19f543d970ad7274f79811b58c93c613850fe1f2 100644
--- a/packages/PEGTL/doc/Migration-Guide.md
+++ b/packages/PEGTL/doc/Migration-Guide.md
@@ -56,4 +56,10 @@ Please contact the authors at `taocpp(at)icemx.net` for any further questions wh
 There were [many important changes](Changelog.md#100) leading up to version 1.0.0.
 Please contact the authors at `taocpp(at)icemx.net` for any further questions when updating the PEGTL.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Parse-Tree.md b/packages/PEGTL/doc/Parse-Tree.md
index 8e668f9f8e2fe78b654d8d423a17f969874bad29..e82f6cf7d49d932986bad8744bcbc7b386afc84b 100644
--- a/packages/PEGTL/doc/Parse-Tree.md
+++ b/packages/PEGTL/doc/Parse-Tree.md
@@ -240,4 +240,10 @@ struct my_node
 
 The parse tree uses a rule's meta data supplied by [`subs_t`](Meta-Data-and-Visit.md#sub-rules) for internal optimizations.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Performance-Notes.md b/packages/PEGTL/doc/Performance-Notes.md
index 6acf14257286df470492ddb8f2ee20093c121217..bad3886260a3498bbb2663ab06f9195627195e7d 100644
--- a/packages/PEGTL/doc/Performance-Notes.md
+++ b/packages/PEGTL/doc/Performance-Notes.md
@@ -42,4 +42,10 @@ However with `-O0`, the optimised `at_one< '"' >` was faster by 5-10% in a [JSON
 
 We still need to test whether the compiler manages to perform the same optimisation in more complex cases.
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/README.md b/packages/PEGTL/doc/README.md
index 7c028ccae0f67e1a8581605c48f4fb5f44491f1a..88bedb59217372fbd3a69cfa2b022e2e52806f89 100644
--- a/packages/PEGTL/doc/README.md
+++ b/packages/PEGTL/doc/README.md
@@ -20,7 +20,6 @@
     * [Embedding in Binaries](Installing-and-Using.md#embedding-in-binaries)
     * [Embedding in Libraries](Installing-and-Using.md#embedding-in-libraries)
     * [Embedding in Library Interfaces](Installing-and-Using.md#embedding-in-library-interfaces)
-  * [Single Header Version](Installing-and-Using.md#single-header-version)
 * [Rules and Grammars](Rules-and-Grammars.md)
   * [Combining Existing Rules](Rules-and-Grammars.md#combining-existing-rules)
   * [Toy S-Expression Grammar](Rules-and-Grammars.md#toy-s-expression-grammar)
@@ -227,6 +226,7 @@
 * [`not_range< C, D >`](Rule-Reference.md#not_range-c-d--2) <sup>[(binary rules)](Rule-Reference.md#binary-rules)</sup>
 * [`nul`](Rule-Reference.md#nul) <sup>[(ascii rules)](Rule-Reference.md#ascii-rules)</sup>
 * [`numeric_type< V >`](Rule-Reference.md#numeric_type-v-) <sup>[(icu rules)](Rule-Reference.md#icu-rules-for-enumerated-properties)</sup>
+* [`odigit`](Rule-Reference.md#odigit) <sup>[(ascii rules)](Rule-Reference.md#ascii-rules)</sup>
 * [`one< C... >`](Rule-Reference.md#one-c-) <sup>[(ascii rules)](Rule-Reference.md#ascii-rules)</sup>
 * [`one< C... >`](Rule-Reference.md#one-c--1) <sup>[(unicode rules)](Rule-Reference.md#unicode-rules)</sup>
 * [`one< C... >`](Rule-Reference.md#one-c--2) <sup>[(binary rules)](Rule-Reference.md#binary-rules)</sup>
@@ -300,4 +300,10 @@
 * [`xid_continue`](Rule-Reference.md#xid_continue) <sup>[(icu rules)](Rule-Reference.md#icu-rules-for-binary-properties)</sup>
 * [`xid_start`](Rule-Reference.md#xid_start) <sup>[(icu rules)](Rule-Reference.md#icu-rules-for-binary-properties)</sup>
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/doc/Rule-Reference.md b/packages/PEGTL/doc/Rule-Reference.md
index 79dd46ccf969050b4f40fbafc9ebe56bee2c5699..b1a6757a68976bc2d9c091c150470707f64a54fa 100644
--- a/packages/PEGTL/doc/Rule-Reference.md
+++ b/packages/PEGTL/doc/Rule-Reference.md
@@ -723,6 +723,13 @@ ASCII rules do not usually rely on other rules.
 * [Equivalent] to `one< '\0' >`.
   - `ascii::nul::rule_t` is `internal::one< result_on_found::success, internal::peek_char, 0 >`
 
+###### `odigit`
+
+* Matches and consumes a single ASCII octal digit character.
+* [Equivalent] to `range< '0', '7' >`.
+* [Meta data] and [implementation] mapping:
+  - `ascii::odigit::rule_t` is `internal::range< internal::result_on_found::success, internal::peek_char, '0', '7' >`
+
 ###### `one< C... >`
 
 * Succeeds when the input is not empty, and:
@@ -1542,7 +1549,13 @@ Binary rules do not rely on other rules.
 * [`xid_continue`](#xid_continue) <sup>[(icu rules)](#icu-rules-for-binary-properties)</sup>
 * [`xid_start`](#xid_start) <sup>[(icu rules)](#icu-rules-for-binary-properties)</sup>
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
 
 [Equivalent]: #equivalence
 [implementation]: #implementation
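
A short, hypothetical grammar using the new `odigit` rule documented above: a C-style octal literal written as a leading `'0'` followed by one or more octal digits.

```cpp
#include <tao/pegtl.hpp>

namespace pegtl = tao::pegtl;

// Matches e.g. "0755": a '0', then one or more octal digits, then end of input.
struct octal_literal
   : pegtl::seq< pegtl::one< '0' >, pegtl::plus< pegtl::odigit >, pegtl::eof >
{};

int main()
{
   pegtl::memory_input in( "0755", "odigit example" );
   return pegtl::parse< octal_literal >( in ) ? 0 : 1;
}
```
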
diff --git a/packages/PEGTL/doc/Rules-and-Grammars.md b/packages/PEGTL/doc/Rules-and-Grammars.md
index 675ebfc3473a6e8f4407ec4e2ac7bebd8f1e8e88..f2242c0f4ab61786dc27b0300f4842adba1dccd9 100644
--- a/packages/PEGTL/doc/Rules-and-Grammars.md
+++ b/packages/PEGTL/doc/Rules-and-Grammars.md
@@ -398,4 +398,10 @@ long literal id was: "fraggle"
 long literal body was: "[foo["
 ```
 
+---
+
+This document is part of the [PEGTL](https://github.com/taocpp/PEGTL).
+
 Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
+Distributed under the Boost Software License, Version 1.0<br>
+See accompanying file [LICENSE_1_0.txt](../LICENSE_1_0.txt) or copy at https://www.boost.org/LICENSE_1_0.txt
diff --git a/packages/PEGTL/include/tao/pegtl.hpp b/packages/PEGTL/include/tao/pegtl.hpp
index 35cc4c6dcb31d6fea988f3df95b4208e6adef58d..982d0759af5339d756f1af83776ca9e67118d353 100644
--- a/packages/PEGTL/include/tao/pegtl.hpp
+++ b/packages/PEGTL/include/tao/pegtl.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_HPP
 #define TAO_PEGTL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/apply_mode.hpp b/packages/PEGTL/include/tao/pegtl/apply_mode.hpp
index 10b98a6668276d8c7aa39e1f5f6d0d6235b11c90..e4517c6e8734be54d0c3939c590fe42aefd16ffe 100644
--- a/packages/PEGTL/include/tao/pegtl/apply_mode.hpp
+++ b/packages/PEGTL/include/tao/pegtl/apply_mode.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_APPLY_MODE_HPP
 #define TAO_PEGTL_APPLY_MODE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/argv_input.hpp b/packages/PEGTL/include/tao/pegtl/argv_input.hpp
index 89eb3d0242168b0fec0e4c391e48e5ce65d3f233..0518082ad067186846073f65845dfab233df883e 100644
--- a/packages/PEGTL/include/tao/pegtl/argv_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/argv_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_ARGV_INPUT_HPP
 #define TAO_PEGTL_ARGV_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/ascii.hpp b/packages/PEGTL/include/tao/pegtl/ascii.hpp
index 8e7b19e29789e2ea0df4e87627e69e810696ce70..73bb2cf5001697be0cfc2c4536a88cd0f2e1f6f5 100644
--- a/packages/PEGTL/include/tao/pegtl/ascii.hpp
+++ b/packages/PEGTL/include/tao/pegtl/ascii.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_ASCII_HPP
 #define TAO_PEGTL_ASCII_HPP
@@ -30,6 +31,7 @@ namespace TAO_PEGTL_NAMESPACE
       template< char... Cs > struct not_one : internal::one< internal::result_on_found::failure, internal::peek_char, Cs... > {};
       template< char Lo, char Hi > struct not_range : internal::range< internal::result_on_found::failure, internal::peek_char, Lo, Hi > {};
       struct nul : internal::one< internal::result_on_found::success, internal::peek_char, char( 0 ) > {};
+      struct odigit : internal::range< internal::result_on_found::success, internal::peek_char, '0', '7' > {};
       template< char... Cs > struct one : internal::one< internal::result_on_found::success, internal::peek_char, Cs... > {};
       struct print : internal::range< internal::result_on_found::success, internal::peek_char, char( 32 ), char( 126 ) > {};
       template< char Lo, char Hi > struct range : internal::range< internal::result_on_found::success, internal::peek_char, Lo, Hi > {};
diff --git a/packages/PEGTL/include/tao/pegtl/buffer_input.hpp b/packages/PEGTL/include/tao/pegtl/buffer_input.hpp
index 0a0ea749447bf09d08da336a4bd94a05fd723b3d..4ea21e7c6c4f91fd258ad2865dd1f75515e7e1a5 100644
--- a/packages/PEGTL/include/tao/pegtl/buffer_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/buffer_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_BUFFER_INPUT_HPP
 #define TAO_PEGTL_BUFFER_INPUT_HPP
@@ -160,7 +161,7 @@ namespace TAO_PEGTL_NAMESPACE
             std::terminate();
 #endif
          }
-         if( const auto r = m_reader( m_end, (std::min)( buffer_free_after_end(), (std::max)( amount - buffer_occupied(), Chunk ) ) ) ) {
+         if( const auto r = m_reader( m_end, ( std::min )( buffer_free_after_end(), ( std::max )( amount - buffer_occupied(), Chunk ) ) ) ) {
             m_end += r;
          }
       }
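
The extra whitespace inside `( std::min )` / `( std::max )` here and in `contrib/integer.hpp` below is just the new clang-format style; the surrounding parentheses are the functional part, since they stop function-style macro expansion when a platform header defines `min`/`max` macros. A small illustration of the idiom:

```cpp
#include <algorithm>
#include <cstddef>

// If some header has done `#define min(a, b) ...` (e.g. <windows.h> without
// NOMINMAX), a plain std::min( a, b ) would be macro-expanded; wrapping the
// name in parentheses suppresses that and calls the real std::min overload.
std::size_t smaller( const std::size_t a, const std::size_t b )
{
   return ( std::min )( a, b );
}
```
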
diff --git a/packages/PEGTL/include/tao/pegtl/change_action.hpp b/packages/PEGTL/include/tao/pegtl/change_action.hpp
index 6e049ea958fd593a21c145bc7d1abebd0968a0d0..75496d45fe92dfa811cd6950cde436b2929450fd 100644
--- a/packages/PEGTL/include/tao/pegtl/change_action.hpp
+++ b/packages/PEGTL/include/tao/pegtl/change_action.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CHANGE_ACTION_HPP
 #define TAO_PEGTL_CHANGE_ACTION_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/change_action_and_state.hpp b/packages/PEGTL/include/tao/pegtl/change_action_and_state.hpp
index fb67181e123a87bffaf7860d7c80bbb0cd2c82ba..248c6fedbeb9599dafd54b6a2d19b07d2bd7477a 100644
--- a/packages/PEGTL/include/tao/pegtl/change_action_and_state.hpp
+++ b/packages/PEGTL/include/tao/pegtl/change_action_and_state.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CHANGE_ACTION_AND_STATE_HPP
 #define TAO_PEGTL_CHANGE_ACTION_AND_STATE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/change_action_and_states.hpp b/packages/PEGTL/include/tao/pegtl/change_action_and_states.hpp
index 6c452e04be4adcc346d2ff88ba180d7b114ccc19..9d0614f2f0620c10aeaf3ce67c049cb58ed7667f 100644
--- a/packages/PEGTL/include/tao/pegtl/change_action_and_states.hpp
+++ b/packages/PEGTL/include/tao/pegtl/change_action_and_states.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CHANGE_ACTION_AND_STATES_HPP
 #define TAO_PEGTL_CHANGE_ACTION_AND_STATES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/change_control.hpp b/packages/PEGTL/include/tao/pegtl/change_control.hpp
index b77a238041ef2fb6586e213c25745ae95113d9fb..e825e235903cff1d30a2d7bd44d6018f96aa02d5 100644
--- a/packages/PEGTL/include/tao/pegtl/change_control.hpp
+++ b/packages/PEGTL/include/tao/pegtl/change_control.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CHANGE_CONTROL_HPP
 #define TAO_PEGTL_CHANGE_CONTROL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/change_state.hpp b/packages/PEGTL/include/tao/pegtl/change_state.hpp
index 9169c078f493510c686aef607a7ecbfd2a33557c..549ff4d7bc3c8f8cb3469f28af39103d4cca6e90 100644
--- a/packages/PEGTL/include/tao/pegtl/change_state.hpp
+++ b/packages/PEGTL/include/tao/pegtl/change_state.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CHANGE_STATE_HPP
 #define TAO_PEGTL_CHANGE_STATE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/change_states.hpp b/packages/PEGTL/include/tao/pegtl/change_states.hpp
index 4ec26dd34bd10ca907cfe802980b2cc6d58798a6..59835c266d7911820e13d08e72ba11ba8cb484ed 100644
--- a/packages/PEGTL/include/tao/pegtl/change_states.hpp
+++ b/packages/PEGTL/include/tao/pegtl/change_states.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CHANGE_STATES_HPP
 #define TAO_PEGTL_CHANGE_STATES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/config.hpp b/packages/PEGTL/include/tao/pegtl/config.hpp
index f1a3b8874f5ae5804a0ff926ce6d2d7ddf879787..a58f22eb00cf1947d12ad17c9a9644e2aa9c6ff9 100644
--- a/packages/PEGTL/include/tao/pegtl/config.hpp
+++ b/packages/PEGTL/include/tao/pegtl/config.hpp
@@ -1,10 +1,13 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONFIG_HPP
 #define TAO_PEGTL_CONFIG_HPP
 
-#if !defined( TAO_PEGTL_NAMESPACE )
+#if defined( TAO_PEGTL_NAMESPACE )
+#pragma message( "TAO_PEGTL_NAMESPACE is deprecated" )
+#else
 #define TAO_PEGTL_NAMESPACE tao::pegtl
 #endif
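
With this change, a build that still customises the namespace gets a compile-time note via the `#pragma message` above, while leaving the macro undefined keeps the default `tao::pegtl`. Purely illustrative:

```cpp
// Hypothetical translation unit that still overrides the namespace.
#define TAO_PEGTL_NAMESPACE my::pegtl  // now also prints "TAO_PEGTL_NAMESPACE is deprecated"
#include <tao/pegtl.hpp>
```
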
 
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/abnf.hpp b/packages/PEGTL/include/tao/pegtl/contrib/abnf.hpp
index df5f39daf36cee116e5b1057616b26b87f60e079..6778f7a10f9b481ab7403aa2e0974e6e781815e4 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/abnf.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/abnf.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ABNF_HPP
 #define TAO_PEGTL_CONTRIB_ABNF_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/add_state.hpp b/packages/PEGTL/include/tao/pegtl/contrib/add_state.hpp
index fc3c46fbfa069f60073b2abe9c9bde3b6721a4b6..be91cd89cb1bf9e96112a8e4b1754410415e3686 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/add_state.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/add_state.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ADD_STATE_HPP
 #define TAO_PEGTL_CONTRIB_ADD_STATE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/alphabet.hpp b/packages/PEGTL/include/tao/pegtl/contrib/alphabet.hpp
index 1a9c24dbf276f8d8144a83f18db38a7fbd99f15f..798530e05af4d18eb04407412051dd6d6376bf35 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/alphabet.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/alphabet.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ALPHABET_HPP
 #define TAO_PEGTL_CONTRIB_ALPHABET_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp b/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp
index 17c2a58df9bfca9336cb99876f844ad9fa2fe312..6b4b093b720cb5b2be239b7d1ddef9ee253506bb 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/analyze.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ANALYZE_HPP
 #define TAO_PEGTL_CONTRIB_ANALYZE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/analyze_traits.hpp b/packages/PEGTL/include/tao/pegtl/contrib/analyze_traits.hpp
index 1d82cf643997e834424405aab3994c5be3990143..b835098fbc5076cb2801aa8c2b825c6db0263a5f 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/analyze_traits.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/analyze_traits.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ANALYZE_TRAITS_HPP
 #define TAO_PEGTL_CONTRIB_ANALYZE_TRAITS_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/check_bytes.hpp b/packages/PEGTL/include/tao/pegtl/contrib/check_bytes.hpp
index 84333c7d7ab6425ab0b923b88eb68d9046cc5668..cfd313db7d7c148e4a7f366a23a895a220bcdadc 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/check_bytes.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/check_bytes.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_CHECK_BYTES_HPP
 #define TAO_PEGTL_CONTRIB_CHECK_BYTES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/control_action.hpp b/packages/PEGTL/include/tao/pegtl/contrib/control_action.hpp
index 06d61e29591f1a2ae6d969355a92ee64548d4154..239e6213bdabbb0d11bcc468dbc8cb27da574586 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/control_action.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/control_action.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_CONTROL_ACTION_HPP
 #define TAO_PEGTL_CONTRIB_CONTROL_ACTION_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/coverage.hpp b/packages/PEGTL/include/tao/pegtl/contrib/coverage.hpp
index 14d32ffcc133bfb3197c94571087b1ee083fd1c3..bfa75424fbe26d170b58c69c5f84ddc25b73795c 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/coverage.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/coverage.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_COVERAGE_HPP
 #define TAO_PEGTL_CONTRIB_COVERAGE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/forward.hpp b/packages/PEGTL/include/tao/pegtl/contrib/forward.hpp
index d60f9b54b8ee1ae3e32ddbe1d7354f6f0c43a415..aa9f351e4e2956fc7f11eb719d695a117439c0ec 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/forward.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/forward.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_FORWARD_HPP
 #define TAO_PEGTL_CONTRIB_FORWARD_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/function.hpp b/packages/PEGTL/include/tao/pegtl/contrib/function.hpp
index bed10604790529ef356ee5ce3da03966d62ad6fc..86d535896d4cef96d48e1b7b071060078032786d 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/function.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/function.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_FUNCTION_HPP
 #define TAO_PEGTL_CONTRIB_FUNCTION_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/http.hpp b/packages/PEGTL/include/tao/pegtl/contrib/http.hpp
index 1755f5469eecbf6564deb9460f0264df3120f416..e794a6dd739a803d4a323728d87daa77fbfe3426 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/http.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/http.hpp
@@ -1,11 +1,12 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_HTTP_HPP
 #define TAO_PEGTL_CONTRIB_HTTP_HPP
 
 #if !defined( __cpp_exceptions )
-#error "Exception support required tao/pegtl/contrib/http.hpp"
+#error "Exception support required for tao/pegtl/contrib/http.hpp"
 #else
 
 #include "../ascii.hpp"
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/icu/internal.hpp b/packages/PEGTL/include/tao/pegtl/contrib/icu/internal.hpp
index 596a7eced7aaa70885965a69577ce41990cbfa33..02976fd6d210a221d6f799aa72e8ef651bda4dd7 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/icu/internal.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/icu/internal.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ICU_INTERNAL_HPP
 #define TAO_PEGTL_CONTRIB_ICU_INTERNAL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/icu/utf16.hpp b/packages/PEGTL/include/tao/pegtl/contrib/icu/utf16.hpp
index b73db99271631eb58c5220e02e70607733bad7d0..1b04ad8b2ec60a361ce8d32395c85c4e1332dc21 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/icu/utf16.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/icu/utf16.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ICU_UTF16_HPP
 #define TAO_PEGTL_CONTRIB_ICU_UTF16_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/icu/utf32.hpp b/packages/PEGTL/include/tao/pegtl/contrib/icu/utf32.hpp
index 9781f567b35e6a58add6a634120db7850f1a3559..738b0e61f1dfe1f9c9d09c17e1e5e80911670908 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/icu/utf32.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/icu/utf32.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ICU_UTF32_HPP
 #define TAO_PEGTL_CONTRIB_ICU_UTF32_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/icu/utf8.hpp b/packages/PEGTL/include/tao/pegtl/contrib/icu/utf8.hpp
index cf72da60ac3724a97dca3092db793db72255f0f8..7a7ced60128523593bc951ec037f3dfdddf603d7 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/icu/utf8.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/icu/utf8.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_ICU_UTF8_HPP
 #define TAO_PEGTL_CONTRIB_ICU_UTF8_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/if_then.hpp b/packages/PEGTL/include/tao/pegtl/contrib/if_then.hpp
index a17801355f5c07a87fcac9a0fb4836af2be2a42e..897f55c026d46394c3a63e0bbcb65083fd01bb0d 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/if_then.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/if_then.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_IF_THEN_HPP
 #define TAO_PEGTL_CONTRIB_IF_THEN_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp b/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp
index ebf6e7f1ca1730a47617ccaea29d565270b5a888..ce8bb4c293fe9ef45f85d1919d49fe4cdfd4b4d3 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/instantiate.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INSTANTIATE_HPP
 #define TAO_PEGTL_CONTRIB_INSTANTIATE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/integer.hpp b/packages/PEGTL/include/tao/pegtl/contrib/integer.hpp
index b8edf793cc7bb528fdbb9208a4604dceafb4c194..2fbc7285dcde90629456db3a5c334ffb5e93e94f 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/integer.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/integer.hpp
@@ -1,11 +1,12 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTEGER_HPP
 #define TAO_PEGTL_CONTRIB_INTEGER_HPP
 
 #if !defined( __cpp_exceptions )
-#error "Exception support required tao/pegtl/contrib/integer.hpp"
+#error "Exception support required for tao/pegtl/contrib/integer.hpp"
 #else
 
 #include <cstdint>
@@ -66,7 +67,7 @@ namespace TAO_PEGTL_NAMESPACE
          return ( '0' <= c ) && ( c <= '9' );
       }
 
-      template< typename Integer, Integer Maximum = (std::numeric_limits< Integer >::max)() >
+      template< typename Integer, Integer Maximum = ( std::numeric_limits< Integer >::max )() >
       [[nodiscard]] constexpr bool accumulate_digit( Integer& result, const char digit ) noexcept
       {
          // Assumes that digit is a digit as per is_digit(); returns false on overflow.
@@ -86,7 +87,7 @@ namespace TAO_PEGTL_NAMESPACE
          return true;
       }
 
-      template< typename Integer, Integer Maximum = (std::numeric_limits< Integer >::max)() >
+      template< typename Integer, Integer Maximum = ( std::numeric_limits< Integer >::max )() >
       [[nodiscard]] constexpr bool accumulate_digits( Integer& result, const std::string_view input ) noexcept
       {
          // Assumes input is a non-empty sequence of digits; returns false on overflow.
@@ -99,7 +100,7 @@ namespace TAO_PEGTL_NAMESPACE
          return true;
       }
 
-      template< typename Integer, Integer Maximum = (std::numeric_limits< Integer >::max)() >
+      template< typename Integer, Integer Maximum = ( std::numeric_limits< Integer >::max )() >
       [[nodiscard]] constexpr bool convert_positive( Integer& result, const std::string_view input ) noexcept
       {
          // Assumes result == 0 and that input is a non-empty sequence of digits; returns false on overflow.
@@ -115,7 +116,7 @@ namespace TAO_PEGTL_NAMESPACE
 
          static_assert( std::is_signed_v< Signed > );
          using Unsigned = std::make_unsigned_t< Signed >;
-         constexpr Unsigned maximum = static_cast< Unsigned >( (std::numeric_limits< Signed >::max)() ) + 1;
+         constexpr Unsigned maximum = static_cast< Unsigned >( ( std::numeric_limits< Signed >::max )() ) + 1;
          Unsigned temporary = 0;
          if( accumulate_digits< Unsigned, maximum >( temporary, input ) ) {
             result = static_cast< Signed >( ~temporary ) + 1;
@@ -124,7 +125,7 @@ namespace TAO_PEGTL_NAMESPACE
          return false;
       }
 
-      template< typename Unsigned, Unsigned Maximum = (std::numeric_limits< Unsigned >::max)() >
+      template< typename Unsigned, Unsigned Maximum = ( std::numeric_limits< Unsigned >::max )() >
       [[nodiscard]] constexpr bool convert_unsigned( Unsigned& result, const std::string_view input ) noexcept
       {
          // Assumes result == 0 and that input is a non-empty sequence of digits; returns false on overflow.
@@ -167,7 +168,7 @@ namespace TAO_PEGTL_NAMESPACE
 
       template< typename ParseInput,
                 typename Unsigned,
-                Unsigned Maximum = (std::numeric_limits< Unsigned >::max)() >
+                Unsigned Maximum = ( std::numeric_limits< Unsigned >::max )() >
       [[nodiscard]] bool match_and_convert_unsigned_with_maximum_throws( ParseInput& in, Unsigned& st )
       {
          // Assumes st == 0.
@@ -193,7 +194,7 @@ namespace TAO_PEGTL_NAMESPACE
 
       template< typename ParseInput,
                 typename Unsigned,
-                Unsigned Maximum = (std::numeric_limits< Unsigned >::max)() >
+                Unsigned Maximum = ( std::numeric_limits< Unsigned >::max )() >
       [[nodiscard]] bool match_and_convert_unsigned_with_maximum_nothrow( ParseInput& in, Unsigned& st )
       {
          // Assumes st == 0.
@@ -304,7 +305,7 @@ namespace TAO_PEGTL_NAMESPACE
       }
    };
 
-   template< typename Unsigned, Unsigned Maximum = (std::numeric_limits< Unsigned >::max)() >
+   template< typename Unsigned, Unsigned Maximum = ( std::numeric_limits< Unsigned >::max )() >
    struct maximum_rule
    {
       using rule_t = maximum_rule;
@@ -320,7 +321,7 @@ namespace TAO_PEGTL_NAMESPACE
       }
    };
 
-   template< typename Unsigned, Unsigned Maximum = (std::numeric_limits< Unsigned >::max)() >
+   template< typename Unsigned, Unsigned Maximum = ( std::numeric_limits< Unsigned >::max )() >
    struct maximum_rule_with_action
    {
       using rule_t = maximum_rule_with_action;
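
The accumulate_digit helpers visible in the context lines above guard against overflow by checking the candidate digit against Maximum before multiplying and adding. The following is a standalone illustration of that kind of check only, written for this note -- accumulate_digit_sketch and the sample values are assumptions, not part of the patch or of PEGTL:

// Standalone sketch of an overflow-safe digit accumulator; not the PEGTL code itself.
#include <cstdint>
#include <limits>

template< typename Integer, Integer Maximum = ( std::numeric_limits< Integer >::max )() >
constexpr bool accumulate_digit_sketch( Integer& result, const char digit ) noexcept
{
   // Reject if result * 10 + ( digit - '0' ) would exceed Maximum.
   constexpr Integer cutoff = Maximum / 10;
   constexpr Integer cutlim = Maximum % 10;
   const Integer d = static_cast< Integer >( digit - '0' );
   if( ( result > cutoff ) || ( ( result == cutoff ) && ( d > cutlim ) ) ) {
      return false;  // would overflow
   }
   result = static_cast< Integer >( result * 10 + d );
   return true;
}

constexpr bool check() noexcept
{
   std::uint8_t v = 25;
   const bool ok = accumulate_digit_sketch( v, '5' );               // 25 * 10 + 5 == 255, still fits
   std::uint8_t w = 25;
   const bool overflow_rejected = !accumulate_digit_sketch( w, '6' );  // 256 would overflow a uint8_t
   return ok && ( v == 255 ) && overflow_rejected;
}

static_assert( check() );
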
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/endian.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/endian.hpp
index ac40ead5894cab5d0004b773fcf88ebeb86e80d0..fc997a311651a584583f72198ab325d158406cec 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/endian.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/endian.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_ENDIAN_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_ENDIAN_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_gcc.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_gcc.hpp
index 9eeb9ad71bffc2ad171ed00757dd793bad9014b1..d6a82315520517ef535f0a947284169376529dd3 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_gcc.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_gcc.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_ENDIAN_GCC_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_ENDIAN_GCC_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_win.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_win.hpp
index 308dd3f018361baeebde0cf080ae49406ebfbace..ac21c8bcc900a3a47b48ef5e9c6a029f61b45839 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_win.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/endian_win.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_ENDIAN_WIN_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_ENDIAN_WIN_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint.hpp
index 2a6c8b3079d8661e76fe2bf385ea8ed908649fb9..f34baa4f708fd0e06424b8100b6d98aea5a6117c 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_PEEK_MASK_UINT_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_PEEK_MASK_UINT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint8.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint8.hpp
index 15bd6926f26467ff01c883e24b18d72d181c7bc6..b57da1809cb774ed26c2f5339a8e04293bf36ebf 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint8.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_mask_uint8.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_PEEK_MASK_UINT8_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_PEEK_MASK_UINT8_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint.hpp
index 04925f5ac3480a7176645cb412d4777c52b68e6f..62f7555b666c03a01384e9051cfcc1a99f1e8f12 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UINT_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UINT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint8.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint8.hpp
index 47b78efb2549146a425849a4537478d76a8de1af..9b437f4c650d827ca9b0e3c3681d1dd62db0093c 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint8.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_uint8.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UINT8_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UINT8_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf16.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf16.hpp
index c37b23918b7dad49dbd0d7bc7294a06b018ff25e..6a5006c216e847d993b5f56b06c9459cfc2dfb26 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf16.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf16.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UTF16_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UTF16_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf32.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf32.hpp
index b48dc4d2c2b0fc96152b8aa4f17fae28c0ad52d4..0d78db0dacf6cc8694a32e1d960362831f314ebe 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf32.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/peek_utf32.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UTF32_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_PEEK_UTF32_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/read_uint.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/read_uint.hpp
index 184c55ee095eebad005a1719b216ed1fdf9511ae..8f294726aa7b64d9b6f358ddd8148cc40d85c900 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/read_uint.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/read_uint.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_READ_UINT_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_READ_UINT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp
index c0071e959d0c3bd52f405b5800e608107716a91c..a7c89b5ec3c86fc76a55f228c667fb3f35960fff 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/set_stack_guard.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_SET_STACK_GUARD_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_SET_STACK_GUARD_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp b/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp
index 4b0cfbe8bf730bdac56b39c6e09cd627b5f928b5..b143457f4ff39776f9101b5284a16b5d53b22e29 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/internal/vector_stack_guard.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_INTERNAL_VECTOR_STACK_GUARD_HPP
 #define TAO_PEGTL_CONTRIB_INTERNAL_VECTOR_STACK_GUARD_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/iri.hpp b/packages/PEGTL/include/tao/pegtl/contrib/iri.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..defdc42fa0d72b1725e97c1c4bf497a8af64e84d
--- /dev/null
+++ b/packages/PEGTL/include/tao/pegtl/contrib/iri.hpp
@@ -0,0 +1,107 @@
+// Copyright (c) 2021 Kelvin Hammond
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef TAO_PEGTL_CONTRIB_IRI_HPP
+#define TAO_PEGTL_CONTRIB_IRI_HPP
+
+#if !defined( __cpp_exceptions )
+#error "Exception support required for tao/pegtl/contrib/iri.hpp"
+#else
+
+#include "../config.hpp"
+#include "../rules.hpp"
+#include "../utf8.hpp"
+
+#include "abnf.hpp"
+#include "uri.hpp"
+
+namespace TAO_PEGTL_NAMESPACE::iri
+{
+   // IRI grammar according to RFC 3987.
+
+   // This grammar is a direct PEG translation of the original URI grammar.
+   // It should be considered experimental -- in case of any issues, in particular
+   // missing rules for attached actions, please contact the developers.
+
+   // Note that this grammar has multiple top-level rules.
+
+   using uri::scheme;
+   using uri::port;
+   using uri::dslash;
+   using uri::IP_literal;
+   using uri::IPv4address;
+   using uri::pct_encoded;
+   using uri::sub_delims;
+   using uri::colon;
+
+   // clang-format off
+   struct ucschar : utf8::ranges<
+      0xA0, 0xD7FF,
+      0xF900, 0xFDCF,
+      0xFDF0, 0xFFEF,
+      0x10000, 0x1FFFD,
+      0x20000, 0x2FFFD,
+      0x30000, 0x3FFFD,
+      0x40000, 0x4FFFD,
+      0x50000, 0x5FFFD,
+      0x60000, 0x6FFFD,
+      0x70000, 0x7FFFD,
+      0x80000, 0x8FFFD,
+      0x90000, 0x9FFFD,
+      0xA0000, 0xAFFFD,
+      0xB0000, 0xBFFFD,
+      0xC0000, 0xCFFFD,
+      0xD0000, 0xDFFFD,
+      0xE1000, 0xEFFFD > {};
+
+   struct iprivate : utf8::ranges< 0xE000, 0xF8FF, 0xF0000, 0xFFFFD, 0x100000, 0x10FFFD > {};
+
+   struct iunreserved : sor< abnf::ALPHA, abnf::DIGIT, one< '-', '.', '_', '~' >, ucschar > {};
+
+   struct ipchar : sor< iunreserved, pct_encoded, sub_delims, one< ':', '@' > > {};
+
+   struct isegment : star< ipchar > {};
+   struct isegment_nz : plus< ipchar > {};
+   // non-zero-length segment without any colon ":"
+   struct isegment_nz_nc : plus< sor< iunreserved, pct_encoded, sub_delims, one< '@' > > > {};
+
+   struct ipath_abempty : star< one< '/' >, isegment > {};
+   struct ipath_absolute : seq< one< '/' >, opt< isegment_nz, star< one< '/' >, isegment > > > {};
+   struct ipath_noscheme : seq< isegment_nz_nc, star< one< '/' >, isegment > > {};
+   struct ipath_rootless : seq< isegment_nz, star< one< '/' >, isegment > > {};
+   struct ipath_empty : success {};
+
+   struct ipath : sor< ipath_noscheme,  // begins with a non-colon segment
+                       ipath_rootless,  // begins with a segment
+                       ipath_absolute,  // begins with "/" but not "//"
+                       ipath_abempty >  // begins with "/" or is empty
+   {};
+
+   struct ireg_name : star< sor< iunreserved, pct_encoded, sub_delims > > {};
+
+   struct ihost : sor< IP_literal, IPv4address, ireg_name > {};
+   struct iuserinfo : star< sor< iunreserved, pct_encoded, sub_delims, colon > > {};
+   struct opt_iuserinfo : opt< iuserinfo, one< '@' > > {};
+   struct iauthority : seq< opt_iuserinfo, ihost, opt< colon, port > > {};
+
+   struct iquery : star< sor< ipchar, iprivate, one< '/', '?' > > > {};
+   struct ifragment : star< sor< ipchar, one< '/', '?' > > > {};
+
+   struct opt_iquery : opt_must< one< '?' >, iquery > {};
+   struct opt_ifragment : opt_must< one< '#' >, ifragment > {};
+
+   struct ihier_part : sor< if_must< dslash, iauthority, ipath_abempty >, ipath_rootless, ipath_absolute, ipath_empty > {};
+   struct irelative_part : sor< if_must< dslash, iauthority, ipath_abempty >, ipath_noscheme, ipath_absolute, ipath_empty > {};
+   struct irelative_ref : seq< irelative_part, opt_iquery, opt_ifragment > {};
+
+   struct IRI : seq< scheme, one< ':' >, ihier_part, opt_iquery, opt_ifragment > {};
+   struct IRI_reference : sor< IRI, irelative_ref > {};
+   struct absolute_IRI : seq< scheme, one< ':' >, ihier_part, opt_iquery > {};
+   // clang-format on
+
+}  // namespace TAO_PEGTL_NAMESPACE::iri
+
+#endif
+#endif
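
The new iri.hpp is used like the existing uri.hpp contrib grammar: pick one of its top-level rules and hand it to parse(). A minimal sketch under that assumption -- the wrapper rule, sample input and main() below are illustrations, not part of the patch:

// Minimal sketch only; the 'grammar' wrapper and the sample IRI are assumptions.
#include <iostream>
#include <string>

#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/iri.hpp>

namespace pegtl = TAO_PEGTL_NAMESPACE;

// Require the whole input to be a single IRI.
struct grammar : pegtl::seq< pegtl::iri::IRI, pegtl::eof > {};

int main()
{
   const std::string input = "https://example.org/path?query#fragment";
   pegtl::memory_input in( input, "input" );
   try {
      std::cout << std::boolalpha << pegtl::parse< grammar >( in ) << '\n';  // prints "true"
   }
   catch( const pegtl::parse_error& e ) {
      // Rules built on if_must<>/opt_must<> throw on certain malformed inputs.
      std::cerr << e.what() << '\n';
   }
   return 0;
}

Since the grammar deliberately has multiple top-level rules, the same pattern applies to IRI_reference, absolute_IRI and irelative_ref.
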
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/json.hpp b/packages/PEGTL/include/tao/pegtl/contrib/json.hpp
index 027201ac88a1691e6fb5195389dd2afc907b0d40..e47aaae4b5b01bb22bd1212781aa6fdd8c347559 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/json.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/json.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_JSON_HPP
 #define TAO_PEGTL_CONTRIB_JSON_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/json_pointer.hpp b/packages/PEGTL/include/tao/pegtl/contrib/json_pointer.hpp
index 6230cfcea1263d460c76ee7d6ef4651012bf4243..5e920d720b1e0d2b82dc77f4252f965eb67e4db2 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/json_pointer.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/json_pointer.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_JSON_POINTER_HPP
 #define TAO_PEGTL_CONTRIB_JSON_POINTER_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/limit_bytes.hpp b/packages/PEGTL/include/tao/pegtl/contrib/limit_bytes.hpp
index cea99fc6fef85375352e6a377c9db42d19201ff5..1447e1fd58fdd2a964bcdcc1263ed90696a27005 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/limit_bytes.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/limit_bytes.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_LIMIT_BYTES_HPP
 #define TAO_PEGTL_CONTRIB_LIMIT_BYTES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/limit_depth.hpp b/packages/PEGTL/include/tao/pegtl/contrib/limit_depth.hpp
index 447bebff7cc21a78f631e37ac23800a2741fd671..a84e002eaed39f027d4c7f2b60a9c8ad27689cd3 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/limit_depth.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/limit_depth.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_LIMIT_DEPTH_HPP
 #define TAO_PEGTL_CONTRIB_LIMIT_DEPTH_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/parse_tree.hpp b/packages/PEGTL/include/tao/pegtl/contrib/parse_tree.hpp
index 2d83d557e6874f07f114075c4101647b96f6893a..8de35b8cd13051ba046959e108933a6c160d484e 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/parse_tree.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/parse_tree.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_PARSE_TREE_HPP
 #define TAO_PEGTL_CONTRIB_PARSE_TREE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/parse_tree_to_dot.hpp b/packages/PEGTL/include/tao/pegtl/contrib/parse_tree_to_dot.hpp
index 79dbb0104e0b51e8d2970e7c2f879f2e33199836..f186bf6ea0c559fb6d8e38c2fe1afb2d1c085b89 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/parse_tree_to_dot.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/parse_tree_to_dot.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_PARSE_TREE_TO_DOT_HPP
 #define TAO_PEGTL_CONTRIB_PARSE_TREE_TO_DOT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/peg.hpp b/packages/PEGTL/include/tao/pegtl/contrib/peg.hpp
deleted file mode 100644
index 3c15c9b721ac6258414c2f14ff538dbd400afa3f..0000000000000000000000000000000000000000
--- a/packages/PEGTL/include/tao/pegtl/contrib/peg.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright (c) 2021 Daniel Deptford
-// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
-
-#ifndef TAO_PEGTL_CONTRIB_PEG_HPP
-#define TAO_PEGTL_CONTRIB_PEG_HPP
-
-#include <tao/pegtl.hpp>
-
-namespace TAO_PEGTL_NAMESPACE::peg
-{
-   // PEG grammar from https://pdos.csail.mit.edu/~baford/packrat/popl04/peg-popl04.pdf
-   namespace grammar
-   {
-      // clang-format off
-      struct AND;
-      struct Char;
-      struct Class;
-      struct CLOSE;
-      struct Comment;
-      struct Definition;
-      struct DOT;
-      struct EndOfFile;
-      struct EndOfLine;
-      struct Expression;
-      struct QUESTION;
-      struct IdentCont;
-      struct Identifier;
-      struct IdentStart;
-      struct LEFTARROW;
-      struct Literal;
-      struct NOT;
-      struct OPEN;
-      struct PLUS;
-      struct Prefix;
-      struct Primary;
-      struct Range;
-      struct Sequence;
-      struct SLASH;
-      struct Space;
-      struct Spacing;
-      struct STAR;
-      struct Suffix;
-
-      struct Grammar : seq< Spacing, plus< Definition >, EndOfFile > {};
-
-      struct Definition : seq< Identifier, LEFTARROW, Expression > {};
-      struct Expression : list< Sequence, SLASH > {};
-      struct Sequence : star< Prefix > {};
-
-      struct Prefix : seq< opt< sor< AND, NOT > >, Suffix > {};
-      struct Suffix : seq< Primary, opt< sor< QUESTION, STAR, PLUS > > > {};
-
-      struct Primary : sor<
-         seq< Identifier, not_at< LEFTARROW > >,
-         seq< OPEN, Expression, CLOSE >,
-         Literal,
-         Class,
-         DOT
-         > {};
-
-      struct Identifier : seq< IdentStart, star< IdentCont >, Spacing > {};
-
-      struct IdentStart : identifier_first {};
-
-      struct IdentCont : identifier_other {};
-
-      struct Literal : sor<
-         seq< one< '\'' >, until< one< '\'' >, Char >, Spacing >,
-         seq< one< '"' >, until< one< '"' >, Char >, Spacing >
-         > {};
-
-      struct Class : seq< one< '[' >, until< one< ']' >, Range >, Spacing > {};
-
-      struct Range : sor<
-         seq< Char, one< '-' >, Char >,
-         Char
-         > {};
-
-      struct Char : sor<
-         seq<
-            one< '\\' >,
-            one< 'n', 'r', 't', '\'', '"', '[', ']', '\\' > >,
-         seq<
-            one< '\\' >,
-            range< '0', '2' >,
-            range< '0', '7' >,
-            range< '0', '7' > >,
-         seq<
-            one< '\\' >,
-            range< '0','7' >,
-            opt< range< '0','7' > > >,
-         seq<
-            not_at< one< '\\' > >,
-            any >
-         > {};
-
-      struct LEFTARROW : seq< string< '<','-' >, Spacing > {};
-      struct SLASH : seq< one< '/' >, Spacing > {};
-      struct AND : seq< one< '&' >, Spacing > {};
-      struct NOT : seq< one< '!' >, Spacing > {};
-      struct QUESTION : seq< one< '?' >, Spacing > {};
-      struct STAR : seq< one< '*' >, Spacing > {};
-      struct PLUS : seq< one< '+' >, Spacing > {};
-      struct OPEN : seq< one< '(' >, Spacing > {};
-      struct CLOSE : seq< one< ')' >, Spacing > {};
-      struct DOT : seq< one< '.' >, Spacing > {};
-
-      struct Spacing : star< sor< Space, Comment > > {};
-      struct Comment : seq< one< '#' >, until< EndOfLine > > {};
-
-      struct Space : sor< one< ' ', '\t' >, EndOfLine > {};
-      struct EndOfLine : sor< string< '\r', '\n' >, one< '\n' >, one< '\r' > > {};
-      struct EndOfFile : eof {};
-      // clang-format on
-
-   }  // namespace grammar
-
-}  // namespace TAO_PEGTL_NAMESPACE::peg
-
-#endif  // TAO_PEGTL_CONTRIB_PEG_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/predicates.hpp b/packages/PEGTL/include/tao/pegtl/contrib/predicates.hpp
index 35a1f9b2f71da1a558ca4118e0f427503a3d5ca7..4e37e29072324e5545ea1cf0dc67c70acdeba3dd 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/predicates.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/predicates.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_PREDICATES_HPP
 #define TAO_PEGTL_CONTRIB_PREDICATES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/print.hpp b/packages/PEGTL/include/tao/pegtl/contrib/print.hpp
index 1583630f9ec62a1bcdb2a87b71f80c92ddfb8ae3..18d803021412de70c06aaac6bb698d959add68b1 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/print.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/print.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_PRINT_HPP
 #define TAO_PEGTL_CONTRIB_PRINT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/print_coverage.hpp b/packages/PEGTL/include/tao/pegtl/contrib/print_coverage.hpp
index 98be601d0d47a190df4383fb74ff0f27372c5a9c..9b0774f0467249aa09eb42716112845845aea6ae 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/print_coverage.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/print_coverage.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_PRINT_COVERAGE_HPP
 #define TAO_PEGTL_CONTRIB_PRINT_COVERAGE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/proto3.hpp b/packages/PEGTL/include/tao/pegtl/contrib/proto3.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..535cfa1c327da830d203f2e526d5d10aea3e0dcb
--- /dev/null
+++ b/packages/PEGTL/include/tao/pegtl/contrib/proto3.hpp
@@ -0,0 +1,143 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef TAO_PEGTL_CONTRIB_PROTO3_HPP
+#define TAO_PEGTL_CONTRIB_PROTO3_HPP
+
+#include "../ascii.hpp"
+#include "../config.hpp"
+#include "../rules.hpp"
+
+namespace TAO_PEGTL_NAMESPACE::proto3
+{
+   // protocol buffer v3
+   // https://developers.google.com/protocol-buffers/docs/reference/proto3-spec
+
+   // clang-format off
+   struct comment_sl : seq< two< '/' >, until< eolf > > {};
+   struct comment_ml : if_must< string< '/', '*' >, until< string< '*', '/' > > > {};
+   struct sp : sor< space, comment_sl, comment_ml > {};
+   struct sps : star< sp > {};
+
+   struct comma : one< ',' > {};
+   struct dot : one< '.' > {};
+   struct equ : one< '=' > {};
+   struct semi : one< ';' > {};
+
+   struct option;
+   struct message;
+   struct extend;
+
+   struct ident_first : ranges< 'a', 'z', 'A', 'Z' > {};  // NOTE: Yes, no '_'.
+   struct ident_other : ranges< 'a', 'z', 'A', 'Z', '0', '9', '_' > {};
+   struct ident : seq< ident_first, star< ident_other > > {};
+   struct full_ident : list_must< ident, dot > {};
+
+   struct hex_lit : seq< one< '0' >, one< 'x', 'X' >, plus< xdigit > > {};
+   struct oct_lit : seq< one< '0' >, plus< odigit > > {};
+   struct dec_lit : seq< range< '1', '9' >, star< digit > >  {};
+   struct int_lit : sor< hex_lit, oct_lit, dec_lit > {};
+
+   struct sign : one< '+', '-' > {};
+   struct exp : seq< one< 'E', 'e' >, opt< sign >, plus< digit > > {};
+   struct float_lit : sor<
+      seq< plus< digit >, dot, exp >,
+      seq< plus< digit >, dot, star< digit >, opt< exp > >,
+      seq< dot, plus< digit >, opt< exp > >,
+      keyword< 'i', 'n', 'f' >,
+      keyword< 'n', 'a', 'n' > > {};
+
+   struct bool_lit : sor< keyword< 't', 'r', 'u', 'e' >,
+                          keyword< 'f', 'a', 'l', 's', 'e' > > {};
+
+   struct hex_escape : if_must< one< 'x', 'X' >, xdigit, xdigit > {};
+   struct oct_escape : if_must< odigit, odigit, odigit > {};
+   struct char_escape : one< 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"' > {};
+   struct escape : if_must< one< '\\' >, hex_escape, oct_escape, char_escape > {};
+   struct char_value : sor< escape, not_one< '\n', '\0' > > {};  // NOTE: No need to exclude '\' from not_one<>, see escape rule.
+   template< char Q >
+   struct str_impl : if_must< one< Q >, until< one< Q >, char_value > > {};
+   struct str_lit : sor< str_impl< '\'' >, str_impl< '"' > > {};
+
+   struct constant : sor< bool_lit, seq< opt< sign >, float_lit >, seq< opt< sign >, int_lit >, str_lit, full_ident > {};
+
+   struct option_name : seq< sor< ident, if_must< one< '(' >, full_ident, one< ')' > > >, star_must< dot, ident > > {};
+   struct option : if_must< keyword< 'o', 'p', 't', 'i', 'o', 'n' >, sps, option_name, sps, equ, sps, constant, sps, semi > {};
+
+   struct bool_type : keyword< 'b', 'o', 'o', 'l' > {};
+   struct bytes_type : keyword< 'b', 'y', 't', 'e', 's' > {};
+   struct double_type : keyword< 'd', 'o', 'u', 'b', 'l', 'e' > {};
+   struct float_type : keyword< 'f', 'l', 'o', 'a', 't' > {};
+   struct string_type : keyword< 's', 't', 'r', 'i', 'n', 'g' > {};
+
+   struct int32_type : keyword< 'i', 'n', 't', '3', '2' > {};
+   struct int64_type : keyword< 'i', 'n', 't', '6', '4' > {};
+   struct sint32_type : keyword< 's', 'i', 'n', 't', '3', '2' > {};
+   struct sint64_type : keyword< 's', 'i', 'n', 't', '6', '4' > {};
+   struct uint32_type : keyword< 'u', 'i', 'n', 't', '3', '2' > {};
+   struct uint64_type : keyword< 'u', 'i', 'n', 't', '6', '4' > {};
+   struct fixed32_type : keyword< 'f', 'i', 'x', 'e', 'd', '3', '2' > {};
+   struct fixed64_type : keyword< 'f', 'i', 'x', 'e', 'd', '6', '4' > {};
+   struct sfixed32_type : keyword< 's', 'f', 'i', 'x', 'e', 'd', '3', '2' > {};
+   struct sfixed64_type : keyword< 's', 'f', 'i', 'x', 'e', 'd', '6', '4' > {};
+
+   struct builtin_type : sor< bool_type, bytes_type, double_type, float_type, string_type, int32_type, int64_type, sint32_type, sint64_type, uint32_type, uint64_type, fixed32_type, fixed64_type, sfixed32_type, sfixed64_type > {};
+
+   struct defined_type : seq< opt< dot >, full_ident > {};  // NOTE: This replaces both message_type and enum_type -- they have the same syntax.
+
+   struct type : sor< builtin_type, defined_type > {};
+
+   struct field_option : if_must< option_name, sps, equ, sps, constant > {};
+   struct field_options : if_must< one< '[' >, sps, list< field_option, comma, sp >, sps, one< ']' > > {};
+   struct field_name : ident {};
+   struct field_number : int_lit {};
+   struct field : seq< opt< sor< keyword< 'o', 'p', 't', 'i', 'o', 'n', 'a', 'l' >, keyword< 'r', 'e', 'p', 'e', 'a', 't', 'e', 'd' > >, sps >, type, sps, field_name, sps, equ, sps, field_number, sps, opt< field_options, sps >, semi > {};
+
+   struct oneof_name : ident {};
+   struct oneof_field : if_must< type, sps, field_name, sps, equ, sps, field_number, sps, opt< field_options, sps >, semi > {};
+   struct oneof_body : sor< oneof_field, semi > {};
+   struct oneof : if_must< keyword< 'o', 'n', 'e', 'o', 'f' >, sps, oneof_name, sps, one< '{' >, sps, until< one< '}' >, oneof_body, sps >, sps > {};
+
+   struct key_type : seq< sor< bool_type, string_type, int32_type, int64_type, sint32_type, sint64_type, uint32_type, uint64_type, fixed32_type, fixed64_type, sfixed32_type, sfixed64_type >, not_at< ident_other > > {};
+   struct map_name : ident {};
+   struct map_field : if_must< keyword< 'm', 'a', 'p' >, sps, one< '<' >, sps, key_type, sps, comma, sps, type, sps, one< '>' >, sps, map_name, sps, equ, sps, field_number, sps, opt< field_options, sps >, semi > {};
+
+   struct range : if_must< int_lit, sps, keyword< 't', 'o' >, sps, sor< int_lit, keyword< 'm', 'a', 'x' > > > {};
+   struct ranges : list_must< range, comma, sp > {};
+   struct field_names : list_must< field_name, comma, sp > {};
+   struct reserved : if_must< keyword< 'r', 'e', 's', 'e', 'r', 'v', 'e', 'd' >, sps, sor< ranges, field_names >, sps, semi > {};
+
+   struct enum_name : ident {};
+   struct enum_value_option : seq< option_name, sps, equ, sps, constant > {};
+   struct enum_field : seq< ident, sps, equ, sps, int_lit, sps, opt_must< one< '[' >, sps, list_must< enum_value_option, comma, sp >, sps, one< ']' >, sps >, semi > {};
+   struct enum_body : if_must< one< '{' >, sps, star< sor< option, enum_field, semi >, sps >, one< '}' > > {};
+   struct enum_def : if_must< keyword< 'e', 'n', 'u', 'm' >, sps, enum_name, sps, enum_body > {};
+
+   struct message_thing : sor< field, enum_def, message, option, oneof, map_field, reserved, extend, semi > {};
+   struct message_body : seq< one<'{'>, sps, star< message_thing, sps >, one<'}'> > {};
+   struct message : if_must< keyword< 'm', 'e', 's', 's', 'a', 'g', 'e' >, sps, defined_type, sps, message_body > {};
+   struct extend : if_must< keyword< 'e', 'x', 't', 'e', 'n', 'd' >, sps, defined_type, sps, message_body > {};
+
+   struct package : if_must< keyword< 'p', 'a', 'c', 'k', 'a', 'g', 'e' >, sps, full_ident, sps, semi > {};
+
+   struct import_option : opt< sor< keyword< 'w', 'e', 'a', 'k' >, keyword< 'p', 'u', 'b', 'l', 'i', 'c' > > > {};
+   struct import : if_must< keyword< 'i', 'm', 'p', 'o', 'r', 't' >, sps, import_option, sps, str_lit, sps, semi > {};
+
+   struct rpc_name : ident {};
+   struct rpc_type : if_must< one< '(' >, sps, opt< keyword< 's', 't', 'r', 'e', 'a', 'm' >, sps >, defined_type, sps, one< ')' > > {};
+   struct rpc_options : if_must< one< '{' >, sps, star< sor< option, semi >, sps >, one< '}' > > {};
+   struct rpc : if_must< keyword< 'r', 'p', 'c' >, sps, rpc_name, sps, rpc_type, sps, keyword< 'r', 'e', 't', 'u', 'r', 'n', 's' >, sps, rpc_type, sps, sor< semi, rpc_options > > {};
+   struct service_name : ident {};
+   struct service : if_must< keyword< 's', 'e', 'r', 'v', 'i', 'c', 'e' >, sps, service_name, sps, one< '{' >, sps, star< sor< option, rpc, semi >, sps >, one< '}' > > {};
+
+   struct body : sor< import, package, option, message, enum_def, service, extend, semi > {};
+
+   struct quote : one< '\'', '"' > {};
+   struct head : if_must< keyword< 's', 'y', 'n', 't', 'a', 'x' >, sps, equ, sps, quote, string< 'p', 'r', 'o', 't', 'o', '3' >, quote, sps, semi > {};
+   struct proto : must< sps, head, sps, star< body, sps >, eof > {};
+   // clang-format on
+
+}  // namespace TAO_PEGTL_NAMESPACE::proto3
+
+#endif
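
The new proto3.hpp exposes a single entry rule, proto3::proto, which already wraps the whole grammar in must< ..., eof >. A minimal validation sketch -- the sample .proto text and main() are assumptions added for illustration:

// Minimal sketch only; the embedded .proto source is an assumption.
#include <iostream>
#include <string>

#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/proto3.hpp>

namespace pegtl = TAO_PEGTL_NAMESPACE;

int main()
{
   const std::string source =
      "syntax = \"proto3\";\n"
      "message Ping {\n"
      "  uint64 sequence = 1;\n"
      "}\n";
   pegtl::memory_input in( source, "example.proto" );
   try {
      // proto3::proto uses must< ..., eof >, so failures throw pegtl::parse_error
      // rather than returning false.
      std::cout << std::boolalpha << pegtl::parse< pegtl::proto3::proto >( in ) << '\n';
   }
   catch( const pegtl::parse_error& e ) {
      std::cerr << e.what() << '\n';
   }
   return 0;
}
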
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/raw_string.hpp b/packages/PEGTL/include/tao/pegtl/contrib/raw_string.hpp
index 819fb6248681a9b4e3dc95f213a914f00492676b..e24c578bcbbe4e7cc49181af95a9b465986bd79a 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/raw_string.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/raw_string.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_RAW_STRING_HPP
 #define TAO_PEGTL_CONTRIB_RAW_STRING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/remove_first_state.hpp b/packages/PEGTL/include/tao/pegtl/contrib/remove_first_state.hpp
index 418347b757493e4acf8ea95e91209f3c5fa702d9..74384c63973b752330946ad98703b79aeea821a0 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/remove_first_state.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/remove_first_state.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_REMOVE_FIRST_STATE_HPP
 #define TAO_PEGTL_CONTRIB_REMOVE_FIRST_STATE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/remove_last_states.hpp b/packages/PEGTL/include/tao/pegtl/contrib/remove_last_states.hpp
index 8c6246407ff22fc68cf27f0150e48a9943272c71..5a4d67ecc2bf9481cc3a81e27ce7fe8328826662 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/remove_last_states.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/remove_last_states.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_REMOVE_LAST_STATES_HPP
 #define TAO_PEGTL_CONTRIB_REMOVE_LAST_STATES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/rep_one_min_max.hpp b/packages/PEGTL/include/tao/pegtl/contrib/rep_one_min_max.hpp
index 13df3b3faa63d74d1f50e2de9adaa4fb21a2a0a5..785e46f98004242ba1202424ce5d733b931e9055 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/rep_one_min_max.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/rep_one_min_max.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_REP_ONE_MIN_MAX_HPP
 #define TAO_PEGTL_CONTRIB_REP_ONE_MIN_MAX_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/rep_string.hpp b/packages/PEGTL/include/tao/pegtl/contrib/rep_string.hpp
index d29040d4a20fdee9016f8a75f536cac3922c1c41..b42061667f759180af7dba159877423a3f226421 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/rep_string.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/rep_string.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_REP_STRING_HPP
 #define TAO_PEGTL_CONTRIB_REP_STRING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/separated_seq.hpp b/packages/PEGTL/include/tao/pegtl/contrib/separated_seq.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..26aac6ab4e5ca78977ce19ba8de23a5224a11451
--- /dev/null
+++ b/packages/PEGTL/include/tao/pegtl/contrib/separated_seq.hpp
@@ -0,0 +1,46 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef TAO_PEGTL_CONTRIB_SEPARATED_SEQ_HPP
+#define TAO_PEGTL_CONTRIB_SEPARATED_SEQ_HPP
+
+#include "../config.hpp"
+
+#include "../internal/seq.hpp"
+#include "../type_list.hpp"
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   namespace internal
+   {
+      template< typename... >
+      struct sep;
+
+      template< typename... Ts, typename S, typename Rule, typename... Rules >
+      struct sep< type_list< Ts... >, S, Rule, Rules... >
+         : sep< type_list< Ts..., Rule, S >, S, Rules... >
+      {};
+
+      template< typename... Ts, typename S, typename Rule >
+      struct sep< type_list< Ts... >, S, Rule >
+      {
+         using type = seq< Ts..., Rule >;
+      };
+
+      template< typename S >
+      struct sep< type_list<>, S >
+      {
+         using type = seq<>;
+      };
+
+   }  // namespace internal
+
+   template< typename S, typename... Rules >
+   struct separated_seq
+      : internal::sep< type_list<>, S, Rules... >::type
+   {};
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#endif
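
separated_seq< S, Rules... > interleaves the separator S between the given rules and expands to a plain seq<>. A small usage sketch under that reading -- the 'triple' rule, sample input and main() are assumptions, not part of the patch:

// Minimal sketch only; 'triple' and the sample input are assumptions.
#include <string>

#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/separated_seq.hpp>

namespace pegtl = TAO_PEGTL_NAMESPACE;

// Expands to seq< digit, one< ',' >, digit, one< ',' >, digit >,
// i.e. exactly three digits separated by commas.
struct triple
   : pegtl::separated_seq< pegtl::one< ',' >, pegtl::digit, pegtl::digit, pegtl::digit >
{};

int main()
{
   const std::string input = "1,2,3";
   pegtl::memory_input in( input, "triple" );
   return pegtl::parse< pegtl::seq< triple, pegtl::eof > >( in ) ? 0 : 1;
}

The internal sep metafunction accumulates Rule, S pairs in a type_list and stops at the last rule, which is why the expansion ends with the final rule rather than a trailing separator.
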
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/shuffle_states.hpp b/packages/PEGTL/include/tao/pegtl/contrib/shuffle_states.hpp
index 31faaea9d4b1cab5fd2632df28a91def2f30620a..8cc969048b756db43f1d7059ade41a8dd1303167 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/shuffle_states.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/shuffle_states.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_SHUFFLE_STATES_HPP
 #define TAO_PEGTL_CONTRIB_SHUFFLE_STATES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/state_control.hpp b/packages/PEGTL/include/tao/pegtl/contrib/state_control.hpp
index cb41b058c6bceb2c9022dc4dc5126b828d085316..a65a61b057b96a001777f60cd2c1227e922d7435 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/state_control.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/state_control.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_STATE_CONTROL_HPP
 #define TAO_PEGTL_CONTRIB_STATE_CONTROL_HPP
@@ -32,7 +33,8 @@ namespace TAO_PEGTL_NAMESPACE
                state.template start< Rule >( in, st... );
             }
 #if defined( _MSC_VER )
-            ( (void)st, ... );
+            ( (void)st,
+              ... );
 #endif
          }
 
@@ -46,7 +48,8 @@ namespace TAO_PEGTL_NAMESPACE
                Control< Rule >::success( in, st... );
             }
 #if defined( _MSC_VER )
-            ( (void)st, ... );
+            ( (void)st,
+              ... );
 #endif
          }
 
@@ -60,7 +63,8 @@ namespace TAO_PEGTL_NAMESPACE
                Control< Rule >::failure( in, st... );
             }
 #if defined( _MSC_VER )
-            ( (void)st, ... );
+            ( (void)st,
+              ... );
 #endif
          }
 
@@ -84,7 +88,8 @@ namespace TAO_PEGTL_NAMESPACE
                Control< Rule >::unwind( in, st... );
             }
 #if defined( _MSC_VER )
-            ( (void)st, ... );
+            ( (void)st,
+              ... );
 #endif
          }
 
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/to_string.hpp b/packages/PEGTL/include/tao/pegtl/contrib/to_string.hpp
index 987f02875f6d45f1ccb784fbd75a631896d85e81..1075e897efe9512e53245ea9ace873e5fad48c31 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/to_string.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/to_string.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_TO_STRING_HPP
 #define TAO_PEGTL_CONTRIB_TO_STRING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/trace.hpp b/packages/PEGTL/include/tao/pegtl/contrib/trace.hpp
index d1ac4380b25d2494cb824ec06b18c853ea468ae6..751ceb50a341b597bc8d2b266c014a505db82b19 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/trace.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/trace.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_TRACE_HPP
 #define TAO_PEGTL_CONTRIB_TRACE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/uint16.hpp b/packages/PEGTL/include/tao/pegtl/contrib/uint16.hpp
index c3c1bf1e35925a292841578cb299cb119a7dd6af..1f7532861aa9c65b5a5e4570db86c0cc8f85f4a7 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/uint16.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/uint16.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_UINT16_HPP
 #define TAO_PEGTL_CONTRIB_UINT16_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/uint32.hpp b/packages/PEGTL/include/tao/pegtl/contrib/uint32.hpp
index 39a46c32230c8d000a8bbeaf1f3ca0b878cece2c..8bd88045bfa85c69a3dab544ecfc562f5201819d 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/uint32.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/uint32.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_UINT32_HPP
 #define TAO_PEGTL_CONTRIB_UINT32_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/uint64.hpp b/packages/PEGTL/include/tao/pegtl/contrib/uint64.hpp
index 3084a59d034f6264c668ebf0806bf883ccf7846b..366c9f4a6e95b7547f21d207495bdd19cb5c7a07 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/uint64.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/uint64.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_UINT64_HPP
 #define TAO_PEGTL_CONTRIB_UINT64_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/uint8.hpp b/packages/PEGTL/include/tao/pegtl/contrib/uint8.hpp
index f9e7dc7cd40fcc6e28a8d1eee284b9252f324cba..a5b325a298cea15ddac33de276364a14ae48c10b 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/uint8.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/uint8.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_UINT8_HPP
 #define TAO_PEGTL_CONTRIB_UINT8_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/unescape.hpp b/packages/PEGTL/include/tao/pegtl/contrib/unescape.hpp
index 5079f8275d72a178a5bd5527c3f85e12e8f9764e..7fb0c081ba054bce64d0d3fcd3ba3a7019f94b87 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/unescape.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/unescape.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_UNESCAPE_HPP
 #define TAO_PEGTL_CONTRIB_UNESCAPE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/uri.hpp b/packages/PEGTL/include/tao/pegtl/contrib/uri.hpp
index 701b2bb4102a6bdaa3d9b194cbd15ee5a74d4ed9..b019a7b36bb06e259cacf2016f2cddc8138a55e3 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/uri.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/uri.hpp
@@ -1,11 +1,12 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_URI_HPP
 #define TAO_PEGTL_CONTRIB_URI_HPP
 
 #if !defined( __cpp_exceptions )
-#error "Exception support required tao/pegtl/contrib/uri.hpp"
+#error "Exception support required for tao/pegtl/contrib/uri.hpp"
 #else
 
 #include <cstdint>
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/utf16.hpp b/packages/PEGTL/include/tao/pegtl/contrib/utf16.hpp
index 090b6b0615069e13990cb21a5fbcaf81aca14f8a..74915e16d44c3ca310e3717a894762dbe54d659b 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/utf16.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/utf16.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_UTF16_HPP
 #define TAO_PEGTL_CONTRIB_UTF16_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/contrib/utf32.hpp b/packages/PEGTL/include/tao/pegtl/contrib/utf32.hpp
index d8b80517a6b5bec4f56e9c085f40f165e45aa9bb..1201fca885e907e8fd2011ef6d48e20248ad5662 100644
--- a/packages/PEGTL/include/tao/pegtl/contrib/utf32.hpp
+++ b/packages/PEGTL/include/tao/pegtl/contrib/utf32.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CONTRIB_UTF32_HPP
 #define TAO_PEGTL_CONTRIB_UTF32_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/cstream_input.hpp b/packages/PEGTL/include/tao/pegtl/cstream_input.hpp
index 1496927b46cd78c85afeeee647a24798ec53a141..c589912261a99aeee6d9983c20be89f6a034b5d9 100644
--- a/packages/PEGTL/include/tao/pegtl/cstream_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/cstream_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_CSTREAM_INPUT_HPP
 #define TAO_PEGTL_CSTREAM_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/demangle.hpp b/packages/PEGTL/include/tao/pegtl/demangle.hpp
index 56cc170c6bd038b18cb95aa323ba033ccaeec45a..97b2606528390d15f53152cb235056ce2aa02c76 100644
--- a/packages/PEGTL/include/tao/pegtl/demangle.hpp
+++ b/packages/PEGTL/include/tao/pegtl/demangle.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_DEMANGLE_HPP
 #define TAO_PEGTL_DEMANGLE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/disable_action.hpp b/packages/PEGTL/include/tao/pegtl/disable_action.hpp
index 0781516f4ecd0cf39e938360d92aa76b96a958a3..92bc2f4d3c0f5bc1c182a14b085729af580bab5f 100644
--- a/packages/PEGTL/include/tao/pegtl/disable_action.hpp
+++ b/packages/PEGTL/include/tao/pegtl/disable_action.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_DISABLE_ACTION_HPP
 #define TAO_PEGTL_DISABLE_ACTION_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/discard_input.hpp b/packages/PEGTL/include/tao/pegtl/discard_input.hpp
index 61ddf0dd224e2d129b0b22ffb09f7a6ae3cb7301..dec0a9af57f812850f981ad141487e01d2baf869 100644
--- a/packages/PEGTL/include/tao/pegtl/discard_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/discard_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_DISCARD_INPUT_HPP
 #define TAO_PEGTL_DISCARD_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/discard_input_on_failure.hpp b/packages/PEGTL/include/tao/pegtl/discard_input_on_failure.hpp
index 6e4a9cd3014504519b7edd5d50a0ac954dcdda8c..305038e96b4d44663d71508d932e848e35ed92aa 100644
--- a/packages/PEGTL/include/tao/pegtl/discard_input_on_failure.hpp
+++ b/packages/PEGTL/include/tao/pegtl/discard_input_on_failure.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_DISCARD_INPUT_ON_FAILURE_HPP
 #define TAO_PEGTL_DISCARD_INPUT_ON_FAILURE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/discard_input_on_success.hpp b/packages/PEGTL/include/tao/pegtl/discard_input_on_success.hpp
index 3edc271bdc0136d796ea2fbe0953f1efefd1edad..c5beb7d91e56bb68ea4a533e913ea36899ccd309 100644
--- a/packages/PEGTL/include/tao/pegtl/discard_input_on_success.hpp
+++ b/packages/PEGTL/include/tao/pegtl/discard_input_on_success.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_DISCARD_INPUT_ON_SUCCESS_HPP
 #define TAO_PEGTL_DISCARD_INPUT_ON_SUCCESS_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/enable_action.hpp b/packages/PEGTL/include/tao/pegtl/enable_action.hpp
index a87726f8852029e49fa73861d80a6e40c4075c88..ae0c09983afdb813ce806793beeb24cec0989140 100644
--- a/packages/PEGTL/include/tao/pegtl/enable_action.hpp
+++ b/packages/PEGTL/include/tao/pegtl/enable_action.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_ENABLE_ACTION_HPP
 #define TAO_PEGTL_ENABLE_ACTION_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/eol.hpp b/packages/PEGTL/include/tao/pegtl/eol.hpp
index 8c496da7ebcc4ec937c45e2057870178715493ae..c2df140e3edf028695f3be563f4942e075d09e4b 100644
--- a/packages/PEGTL/include/tao/pegtl/eol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/eol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_EOL_HPP
 #define TAO_PEGTL_EOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/eol_pair.hpp b/packages/PEGTL/include/tao/pegtl/eol_pair.hpp
index 239af370ead7b6d26e6c7d8aef12c6dd1dfa3bcf..b139d3bdccffb9726233a52cf65108a96a628525 100644
--- a/packages/PEGTL/include/tao/pegtl/eol_pair.hpp
+++ b/packages/PEGTL/include/tao/pegtl/eol_pair.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_EOL_PAIR_HPP
 #define TAO_PEGTL_EOL_PAIR_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/file_input.hpp b/packages/PEGTL/include/tao/pegtl/file_input.hpp
index 91895b31e4a9aad63b6968376eb9a2710e15b825..408b89fc0e2cbfdb62342c78d111e7ca5a475017 100644
--- a/packages/PEGTL/include/tao/pegtl/file_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/file_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_FILE_INPUT_HPP
 #define TAO_PEGTL_FILE_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/action.hpp b/packages/PEGTL/include/tao/pegtl/internal/action.hpp
index 3443c299775424dc08b547662b9565826bd46149..8068241e87b14c3a72419038c30b29fd7da7f820 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/action.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/action.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ACTION_HPP
 #define TAO_PEGTL_INTERNAL_ACTION_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/action_input.hpp b/packages/PEGTL/include/tao/pegtl/internal/action_input.hpp
index ccc2d16501d84ac9926fe900bad3af0765a5ed50..fa622fa326cf0e87d9b9dc3ce91dda531b753070 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/action_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/action_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ACTION_INPUT_HPP
 #define TAO_PEGTL_INTERNAL_ACTION_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/any.hpp b/packages/PEGTL/include/tao/pegtl/internal/any.hpp
index 59ba05f5af3d052e99355a759b977435853462e5..9e710c5f876d154e3f78f4f631b6da419f9ed9b2 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/any.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/any.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ANY_HPP
 #define TAO_PEGTL_INTERNAL_ANY_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/apply.hpp b/packages/PEGTL/include/tao/pegtl/internal/apply.hpp
index cc5e9fd9e7f39de0d1da9413cb6cecad55d371e8..97a342b86a45a096fa90a50af91113c698a9a39a 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/apply.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/apply.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_APPLY_HPP
 #define TAO_PEGTL_INTERNAL_APPLY_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/apply0.hpp b/packages/PEGTL/include/tao/pegtl/internal/apply0.hpp
index a6e37f4e4d710f96a6deff53493d4ca920eb4750..2710de8e8ab89eea8b357b7de28f4e0dcd8b20d4 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/apply0.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/apply0.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_APPLY0_HPP
 #define TAO_PEGTL_INTERNAL_APPLY0_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/apply0_single.hpp b/packages/PEGTL/include/tao/pegtl/internal/apply0_single.hpp
index 11939064d828dbf41e00dce2cca02e5765bd2a04..167031924635e65c26d8a67093556fc4e33f67b5 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/apply0_single.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/apply0_single.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_APPLY0_SINGLE_HPP
 #define TAO_PEGTL_INTERNAL_APPLY0_SINGLE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/apply_single.hpp b/packages/PEGTL/include/tao/pegtl/internal/apply_single.hpp
index 57b0ca8fc0e66c0093ec087953a86076c765b1db..728ad4772e6d85a23f9f3d47bdde272543472b45 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/apply_single.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/apply_single.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_APPLY_SINGLE_HPP
 #define TAO_PEGTL_INTERNAL_APPLY_SINGLE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/at.hpp b/packages/PEGTL/include/tao/pegtl/internal/at.hpp
index 7f7d2894c52fe42c45d3736af80d103b2357035a..117b5a429d943d5c224e4afcb3254c266bb600a5 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/at.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/at.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_AT_HPP
 #define TAO_PEGTL_INTERNAL_AT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/bof.hpp b/packages/PEGTL/include/tao/pegtl/internal/bof.hpp
index da5c920bcc8e30dd16389efaae616beeb1558053..7038d953f77dd727003bcd7d3e526f9d74b0e9e8 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/bof.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/bof.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_BOF_HPP
 #define TAO_PEGTL_INTERNAL_BOF_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/bol.hpp b/packages/PEGTL/include/tao/pegtl/internal/bol.hpp
index a71109d77c3b3bb8a65d4e116f766c0dc8931d0a..759a8bd01cf91665908301f5a25ad4365ce1c094 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/bol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/bol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_BOL_HPP
 #define TAO_PEGTL_INTERNAL_BOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/bump.hpp b/packages/PEGTL/include/tao/pegtl/internal/bump.hpp
index 02e2278a264596c4cfed965bfa249c91fc41b6d9..1b305f3a2f65187c05c38f16c662e81cec364d8d 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/bump.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/bump.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_BUMP_HPP
 #define TAO_PEGTL_INTERNAL_BUMP_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/bump_help.hpp b/packages/PEGTL/include/tao/pegtl/internal/bump_help.hpp
index 68c82b99758b59f8e960874e56e2534eefc69ca2..f40c6171cb97390956532ea7ae67b92decb3241b 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/bump_help.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/bump_help.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_BUMP_HELP_HPP
 #define TAO_PEGTL_INTERNAL_BUMP_HELP_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/bytes.hpp b/packages/PEGTL/include/tao/pegtl/internal/bytes.hpp
index 476ce965df766c1869edc11657b8bdc6b1084873..2012f05c74c340ec6084d0855aae830d53b49a9f 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/bytes.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/bytes.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_BYTES_HPP
 #define TAO_PEGTL_INTERNAL_BYTES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/control.hpp b/packages/PEGTL/include/tao/pegtl/internal/control.hpp
index e434ae06769efe1884ab87b71d898cba24fc764f..a202f5d43922327f9ac482f42a443ccb3fe776aa 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/control.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/control.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_CONTROL_HPP
 #define TAO_PEGTL_INTERNAL_CONTROL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/cr_crlf_eol.hpp b/packages/PEGTL/include/tao/pegtl/internal/cr_crlf_eol.hpp
index 22504513922d6ace165d1cef4cf51bb27910e6df..cf2e72ec8a74b11f8b89526b1d861233b4b6b137 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/cr_crlf_eol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/cr_crlf_eol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_CR_CRLF_EOL_HPP
 #define TAO_PEGTL_INTERNAL_CR_CRLF_EOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/cr_eol.hpp b/packages/PEGTL/include/tao/pegtl/internal/cr_eol.hpp
index 833a7c0c31b193f5b0240991e32018fc46d385f6..5d1a61951134c91904c87bb09d3cfbd9925872f5 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/cr_eol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/cr_eol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_CR_EOL_HPP
 #define TAO_PEGTL_INTERNAL_CR_EOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/crlf_eol.hpp b/packages/PEGTL/include/tao/pegtl/internal/crlf_eol.hpp
index 5b71de8306191a885e6352303f68ddbadb51dd8b..f419a6f483ec64097884988a33ebd04240d14c0c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/crlf_eol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/crlf_eol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_CRLF_EOL_HPP
 #define TAO_PEGTL_INTERNAL_CRLF_EOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/cstream_reader.hpp b/packages/PEGTL/include/tao/pegtl/internal/cstream_reader.hpp
index e8ca048c3e202fe2df558b8cb33b13ecf6ef7898..0dc630539a2d9a0b066986635627a29197b827d1 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/cstream_reader.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/cstream_reader.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_CSTREAM_READER_HPP
 #define TAO_PEGTL_INTERNAL_CSTREAM_READER_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/cstring_reader.hpp b/packages/PEGTL/include/tao/pegtl/internal/cstring_reader.hpp
index c98044c812d3f3a4f48bc41bc279c4243a2c5993..4c3b10e7496daccf5c75781f3b76f1c3f52b1905 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/cstring_reader.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/cstring_reader.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_CSTRING_READER_HPP
 #define TAO_PEGTL_INTERNAL_CSTRING_READER_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/dependent_false.hpp b/packages/PEGTL/include/tao/pegtl/internal/dependent_false.hpp
index 57661a415d295fbdb559518964e91b326f41e896..a5cc470a3a6cdb78b7f8f08d15ff39de33fbfa68 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/dependent_false.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/dependent_false.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_DEPENDENT_FALSE_HPP
 #define TAO_PEGTL_INTERNAL_DEPENDENT_FALSE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/disable.hpp b/packages/PEGTL/include/tao/pegtl/internal/disable.hpp
index 9a294d3b8415bcf2733902dac9e61f9af3ec3ca2..76bdc5a8e1c0cbb0c6903c8bf25dc830856bd24c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/disable.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/disable.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_DISABLE_HPP
 #define TAO_PEGTL_INTERNAL_DISABLE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/discard.hpp b/packages/PEGTL/include/tao/pegtl/internal/discard.hpp
index b92760d04479a95d0d21a9f2a5bb2807004c75a3..bea7aa3f2dace46471fcc4be7e683f7c22db1c36 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/discard.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/discard.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_DISCARD_HPP
 #define TAO_PEGTL_INTERNAL_DISCARD_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/enable.hpp b/packages/PEGTL/include/tao/pegtl/internal/enable.hpp
index 9c453435c97d5efe2be4d7e46da8ba2f8dc612e2..cb4fbe9dbc7a4db4bf9729b4a7bc3d93c8054379 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/enable.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/enable.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ENABLE_HPP
 #define TAO_PEGTL_INTERNAL_ENABLE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/enable_control.hpp b/packages/PEGTL/include/tao/pegtl/internal/enable_control.hpp
index 16ca4236900044b7f29930d739a569d614d14205..4fe5ae199cbbc1fe5d7950d4250cf7066763b555 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/enable_control.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/enable_control.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ENABLE_CONTROL_HPP
 #define TAO_PEGTL_INTERNAL_ENABLE_CONTROL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/eof.hpp b/packages/PEGTL/include/tao/pegtl/internal/eof.hpp
index c4f0f81321811fa876c7e79894e7f8f0fcee4540..e227e0474fd0f19b40a0b070d599c77422972ee4 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/eof.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/eof.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_EOF_HPP
 #define TAO_PEGTL_INTERNAL_EOF_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/eol.hpp b/packages/PEGTL/include/tao/pegtl/internal/eol.hpp
index ecec04f32defd31c8f65e27abc0a0070e12219ba..6f8633afc96504de82d797af0aaf5563f02698c5 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/eol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/eol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_EOL_HPP
 #define TAO_PEGTL_INTERNAL_EOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/eolf.hpp b/packages/PEGTL/include/tao/pegtl/internal/eolf.hpp
index 71fa8d891398473c985b8fe7e055b7f7bed462f5..101605b9e50d30e875722749e8c76ec1fee9203b 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/eolf.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/eolf.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_EOLF_HPP
 #define TAO_PEGTL_INTERNAL_EOLF_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/failure.hpp b/packages/PEGTL/include/tao/pegtl/internal/failure.hpp
index 75084997700d74d50672e0a6974de0ff4f08fafd..8f14322b8167f6466dd51a456f289a0c352b8a1a 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/failure.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/failure.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_FAILURE_HPP
 #define TAO_PEGTL_INTERNAL_FAILURE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp
index 68bb8f6d65c174a0ef04cc96023fb0672aaa8697..def276bce3bab1a100934f66f45946dde0baf31c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_posix.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_FILE_MAPPER_POSIX_HPP
 #define TAO_PEGTL_INTERNAL_FILE_MAPPER_POSIX_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp
index 73c27dec90556f0b091ed191a28ca6c4557e3d95..4a353ccf4fd0feda569010d281352ed39ba42b66 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/file_mapper_win32.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_FILE_MAPPER_WIN32_HPP
 #define TAO_PEGTL_INTERNAL_FILE_MAPPER_WIN32_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp b/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp
index 7c2b054942877d96d308ea4be0a27e49e1917f00..a842f4bd005724dea283075b7fbcca07c66b9b0d 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/file_reader.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_FILE_READER_HPP
 #define TAO_PEGTL_INTERNAL_FILE_READER_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/filesystem.hpp b/packages/PEGTL/include/tao/pegtl/internal/filesystem.hpp
index ad241abd222ac917a8032b91bdb2be3e8c62cc00..fdb5e2f1013cf8af8dd337724a6a33690d512ed5 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/filesystem.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/filesystem.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_FILESYSTEM_HPP
 #define TAO_PEGTL_INTERNAL_FILESYSTEM_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/has_apply.hpp b/packages/PEGTL/include/tao/pegtl/internal/has_apply.hpp
index 697f6ff3c62528307a65a733497a305149fdd3d1..5b746ebccb65a25f284c471d9c20445a3da7ff91 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/has_apply.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/has_apply.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_HAS_APPLY_HPP
 #define TAO_PEGTL_INTERNAL_HAS_APPLY_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/has_apply0.hpp b/packages/PEGTL/include/tao/pegtl/internal/has_apply0.hpp
index 3f5fc599d3703110a6d568d236bd74ecdafeb06b..cb14882ac98895a1ebf86f7743b1b1624f4b42ce 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/has_apply0.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/has_apply0.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_HAS_APPLY0_HPP
 #define TAO_PEGTL_INTERNAL_HAS_APPLY0_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/has_match.hpp b/packages/PEGTL/include/tao/pegtl/internal/has_match.hpp
index 0606d7a5a3d42c22fef50e80dc75908b63bccbb2..ab842587437d7e32e36ab6dfabb64fad61e2dbae 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/has_match.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/has_match.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_HAS_MATCH_HPP
 #define TAO_PEGTL_INTERNAL_HAS_MATCH_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/has_unwind.hpp b/packages/PEGTL/include/tao/pegtl/internal/has_unwind.hpp
index 566c24e4e5f2cc14b8f54cadc4a5c68eb2480653..8714d93725868dfef1a148c6c6449a8841b26397 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/has_unwind.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/has_unwind.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_HAS_UNWIND_HPP
 #define TAO_PEGTL_INTERNAL_HAS_UNWIND_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/identifier.hpp b/packages/PEGTL/include/tao/pegtl/internal/identifier.hpp
index 4ecbbb3244959798596a667842bc6524a7e4d8da..fc78c220b993192dc47bc9375e933fd3d5ef374c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/identifier.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/identifier.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_IDENTIFIER_HPP
 #define TAO_PEGTL_INTERNAL_IDENTIFIER_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/if_apply.hpp b/packages/PEGTL/include/tao/pegtl/internal/if_apply.hpp
index 27dd002abd37be48b5d75d45397adec761745cd8..797be335928f20dc3bb16772ca59c45f9e1af215 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/if_apply.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/if_apply.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_IF_APPLY_HPP
 #define TAO_PEGTL_INTERNAL_IF_APPLY_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/if_must.hpp b/packages/PEGTL/include/tao/pegtl/internal/if_must.hpp
index 4cf2260e10f32406c4a3d04c589531304261ed46..9f5e0bacc452056815610a0cce362aaf22fb2aa4 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/if_must.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/if_must.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_IF_MUST_HPP
 #define TAO_PEGTL_INTERNAL_IF_MUST_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/if_must_else.hpp b/packages/PEGTL/include/tao/pegtl/internal/if_must_else.hpp
index 685c7ca211bc0615a88369dc6803e46c1fc77b44..51ad934824b5a48cb010fc909903c3691e3f05ea 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/if_must_else.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/if_must_else.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_IF_MUST_ELSE_HPP
 #define TAO_PEGTL_INTERNAL_IF_MUST_ELSE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/if_then_else.hpp b/packages/PEGTL/include/tao/pegtl/internal/if_then_else.hpp
index bc6fa3cea2484526cfe0589763b8a1b1404b6b7c..dfe62ceb957f1943102f0837f4742fb64a3afa0a 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/if_then_else.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/if_then_else.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_IF_THEN_ELSE_HPP
 #define TAO_PEGTL_INTERNAL_IF_THEN_ELSE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/input_pair.hpp b/packages/PEGTL/include/tao/pegtl/internal/input_pair.hpp
index a3bf92210652df71f37f0af4271e8667aad442bf..423375c23f311ca3269a725280b8204b0a2484ac 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/input_pair.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/input_pair.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_INPUT_PAIR_HPP
 #define TAO_PEGTL_INTERNAL_INPUT_PAIR_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/istream_reader.hpp b/packages/PEGTL/include/tao/pegtl/internal/istream_reader.hpp
index 371ffbab5c27bbdf97432f2b3adbc39451cff64d..4b67437194f2e69f73ab6762e58a5ffb941c9c26 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/istream_reader.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/istream_reader.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ISTREAM_READER_HPP
 #define TAO_PEGTL_INTERNAL_ISTREAM_READER_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/istring.hpp b/packages/PEGTL/include/tao/pegtl/internal/istring.hpp
index f7abe365feef29043788340c99b4d2b9bcddc4b4..5969c0672f8e2075f691daa0328fcd359cccbf04 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/istring.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/istring.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ISTRING_HPP
 #define TAO_PEGTL_INTERNAL_ISTRING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/iterator.hpp b/packages/PEGTL/include/tao/pegtl/internal/iterator.hpp
index 758016772646b38eb436b712fe1ccc6870100a36..96099cab899619a0cab5815caa8a79aff3e4128d 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/iterator.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/iterator.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ITERATOR_HPP
 #define TAO_PEGTL_INTERNAL_ITERATOR_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/lf_crlf_eol.hpp b/packages/PEGTL/include/tao/pegtl/internal/lf_crlf_eol.hpp
index c3eae42c1b1a2e32ba85071926bab84d119c3cfc..ac266014a7f07f1990149e47241cc3e38d1c2f09 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/lf_crlf_eol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/lf_crlf_eol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_LF_CRLF_EOL_HPP
 #define TAO_PEGTL_INTERNAL_LF_CRLF_EOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/lf_eol.hpp b/packages/PEGTL/include/tao/pegtl/internal/lf_eol.hpp
index 0f4e33d64925aa4fd75888e75407d7803e85da28..d0f3adceb71904a82e4848f253b4ea6a02d4b31b 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/lf_eol.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/lf_eol.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_LF_EOL_HPP
 #define TAO_PEGTL_INTERNAL_LF_EOL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/list.hpp b/packages/PEGTL/include/tao/pegtl/internal/list.hpp
index 07318e6572ac825d67e8eb562a85d81b6e62e889..24a3935163bfa627e80cc21df792f9cca5d3f41a 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/list.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/list.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_LIST_HPP
 #define TAO_PEGTL_INTERNAL_LIST_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/list_must.hpp b/packages/PEGTL/include/tao/pegtl/internal/list_must.hpp
index aff1786960e1e4e053f95232af296d665d5faf76..1d9cef89ef7b31f22512bc1f426981a43409d081 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/list_must.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/list_must.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_LIST_MUST_HPP
 #define TAO_PEGTL_INTERNAL_LIST_MUST_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/list_tail.hpp b/packages/PEGTL/include/tao/pegtl/internal/list_tail.hpp
index 5fe3ff413ed1be126fccc90c94f2d362fa5cfbbf..59fae07ff18b969770d379084ac84752a57721ac 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/list_tail.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/list_tail.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_LIST_TAIL_HPP
 #define TAO_PEGTL_INTERNAL_LIST_TAIL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/list_tail_pad.hpp b/packages/PEGTL/include/tao/pegtl/internal/list_tail_pad.hpp
index b96c626b8e8fe4b233a4cbcd4a2e28f4ba99f92c..f204b23e83f09302731442a12a71f8aae79fe27e 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/list_tail_pad.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/list_tail_pad.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_LIST_TAIL_PAD_HPP
 #define TAO_PEGTL_INTERNAL_LIST_TAIL_PAD_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/marker.hpp b/packages/PEGTL/include/tao/pegtl/internal/marker.hpp
index a9ffa824f356e85addebb0c99ddab3373c62b3df..a061aea738eef1866887be743e4f6125605526a3 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/marker.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/marker.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_MARKER_HPP
 #define TAO_PEGTL_INTERNAL_MARKER_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/minus.hpp b/packages/PEGTL/include/tao/pegtl/internal/minus.hpp
index 4062f420e6bb7a301bb71b964b02a49249ba5562..f89ec02c8bf04a4937a5f8a0036657899d63aa5d 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/minus.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/minus.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_MINUS_HPP
 #define TAO_PEGTL_INTERNAL_MINUS_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/missing_apply.hpp b/packages/PEGTL/include/tao/pegtl/internal/missing_apply.hpp
index e69cbf3a9b30187091b0dbe3699f434aa9dd54fb..b979c1420357a2d24dd6762a8a384d899c3700c9 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/missing_apply.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/missing_apply.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_MISSING_APPLY_HPP
 #define TAO_PEGTL_INTERNAL_MISSING_APPLY_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/missing_apply0.hpp b/packages/PEGTL/include/tao/pegtl/internal/missing_apply0.hpp
index 2b63a28e5a50f30363b8761c2a7f4eb3bc7e9213..fedd156c0cb6b622488f261545effc523db15742 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/missing_apply0.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/missing_apply0.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_MISSING_APPLY0_HPP
 #define TAO_PEGTL_INTERNAL_MISSING_APPLY0_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/must.hpp b/packages/PEGTL/include/tao/pegtl/internal/must.hpp
index 213fc56eb2f0c336ea63d0c78c801c2cceeeb6e0..cc77f705814392f5f43af452367d84c600172819 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/must.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/must.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_MUST_HPP
 #define TAO_PEGTL_INTERNAL_MUST_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/not_at.hpp b/packages/PEGTL/include/tao/pegtl/internal/not_at.hpp
index 00758969724a0838d1a3f0d93c20997afffe9d1c..2e27e2d629795cfd5b90ec53829bc5c974459c83 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/not_at.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/not_at.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_NOT_AT_HPP
 #define TAO_PEGTL_INTERNAL_NOT_AT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/one.hpp b/packages/PEGTL/include/tao/pegtl/internal/one.hpp
index e2bc05ef41cc1bc599ab0be5ce0fe73f3577e8ad..2eac40f659d9d982b7b60b2200b10e6495551ac1 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/one.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/one.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_ONE_HPP
 #define TAO_PEGTL_INTERNAL_ONE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/opt.hpp b/packages/PEGTL/include/tao/pegtl/internal/opt.hpp
index 9dd1a0cf419401bc657ec1d6f68b959db58529fe..4a10afe77090dea98fe3bcab1d4e570514f1318c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/opt.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/opt.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_OPT_HPP
 #define TAO_PEGTL_INTERNAL_OPT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/pad.hpp b/packages/PEGTL/include/tao/pegtl/internal/pad.hpp
index ff49ce22c78dc9626fb62913e4973805745b073d..13b473e2479b27f6fb738f1b7d9bf4a18b774599 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/pad.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/pad.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_PAD_HPP
 #define TAO_PEGTL_INTERNAL_PAD_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/pad_opt.hpp b/packages/PEGTL/include/tao/pegtl/internal/pad_opt.hpp
index 02af2363a9a8fffb48f777e66058c7aa78ef29da..80b5064f796b8eb8818b8946742afc80871e69ec 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/pad_opt.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/pad_opt.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_PAD_OPT_HPP
 #define TAO_PEGTL_INTERNAL_PAD_OPT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/path_to_string.hpp b/packages/PEGTL/include/tao/pegtl/internal/path_to_string.hpp
index d28dbac739888cda5138ec6ea399ad2ae01036a6..7d7757a0bdf50f785f95bf7c6980dde0731c5b5d 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/path_to_string.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/path_to_string.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_PATH_TO_STRING_HPP
 #define TAO_PEGTL_INTERNAL_PATH_TO_STRING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/peek_char.hpp b/packages/PEGTL/include/tao/pegtl/internal/peek_char.hpp
index 7dbb06f699faaeca7e4d8f9808e06a6505a5b93f..e85e5607d94e95ce669cc2c9995d159928adb25c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/peek_char.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/peek_char.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_PEEK_CHAR_HPP
 #define TAO_PEGTL_INTERNAL_PEEK_CHAR_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/peek_utf8.hpp b/packages/PEGTL/include/tao/pegtl/internal/peek_utf8.hpp
index 77f623d1069075b687f96053a70cfc9172f0cc78..a47d813a361d1af48209703b1bd6e108fd926c70 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/peek_utf8.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/peek_utf8.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_PEEK_UTF8_HPP
 #define TAO_PEGTL_INTERNAL_PEEK_UTF8_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/pegtl_string.hpp b/packages/PEGTL/include/tao/pegtl/internal/pegtl_string.hpp
index aac858d018e4300e0bf817c819776fd640bb14c4..576c216d286a45c7c8df26b8a6351526b5143855 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/pegtl_string.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/pegtl_string.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_PEGTL_STRING_HPP
 #define TAO_PEGTL_INTERNAL_PEGTL_STRING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/plus.hpp b/packages/PEGTL/include/tao/pegtl/internal/plus.hpp
index 4a0ee5a99d9a7d6bf9e1f294d3794d35cda62e70..2ebdd3e62e42437d299fd4b38f7ff3ad338b4aaf 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/plus.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/plus.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_PLUS_HPP
 #define TAO_PEGTL_INTERNAL_PLUS_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/raise.hpp b/packages/PEGTL/include/tao/pegtl/internal/raise.hpp
index 9d9c831889c5be7b2ee2654f11565bb19511bef0..11db9fce43efc3c33d438eddabbba4c6f02e148f 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/raise.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/raise.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_RAISE_HPP
 #define TAO_PEGTL_INTERNAL_RAISE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/range.hpp b/packages/PEGTL/include/tao/pegtl/internal/range.hpp
index d1ad9e98dde2d61f40e9689032299042706e1122..f504161d1c93485b4643c4ae74e055695b974ee7 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/range.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/range.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_RANGE_HPP
 #define TAO_PEGTL_INTERNAL_RANGE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp b/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp
index bf57db611e00058ae681096676cb5893efe7b355..9df06d7b3d756778b2fc64dd1938378f7e156468 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/ranges.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_RANGES_HPP
 #define TAO_PEGTL_INTERNAL_RANGES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/rematch.hpp b/packages/PEGTL/include/tao/pegtl/internal/rematch.hpp
index a1f7e3496c283944bbbc9baaa270162677a7936f..4877a3401e6d9f81d8b29159f1742f15dd6608dc 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/rematch.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/rematch.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_REMATCH_HPP
 #define TAO_PEGTL_INTERNAL_REMATCH_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/rep.hpp b/packages/PEGTL/include/tao/pegtl/internal/rep.hpp
index 3ef122d64452670b63c5eeed229f1cfbcaff1d49..79a2be78917f73779e906b0e9c36b1dc552ab8c4 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/rep.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/rep.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_REP_HPP
 #define TAO_PEGTL_INTERNAL_REP_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/rep_min.hpp b/packages/PEGTL/include/tao/pegtl/internal/rep_min.hpp
index 73f8531aa3e7d56bd2f62f13bb85e263ff9d1ada..a6efe9e6734f44152544c4fc5ad8111ec662c21e 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/rep_min.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/rep_min.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_REP_MIN_HPP
 #define TAO_PEGTL_INTERNAL_REP_MIN_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/rep_min_max.hpp b/packages/PEGTL/include/tao/pegtl/internal/rep_min_max.hpp
index b600152f019e1d2362573215879a3c80a93f2ead..b0026458efa40eb655aeac6f2dede2ab7111836c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/rep_min_max.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/rep_min_max.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_REP_MIN_MAX_HPP
 #define TAO_PEGTL_INTERNAL_REP_MIN_MAX_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/rep_opt.hpp b/packages/PEGTL/include/tao/pegtl/internal/rep_opt.hpp
index 435332d45fb393d6521526f257327bbc95b9d926..bea26e5be97d4ff612e2c68b6e19c22d21094009 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/rep_opt.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/rep_opt.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_REP_OPT_HPP
 #define TAO_PEGTL_INTERNAL_REP_OPT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/require.hpp b/packages/PEGTL/include/tao/pegtl/internal/require.hpp
index f2ce5a02280a90f72a787594f047da2c9ed9e834..b4e24fb158c13e5bc4c3b697e8e58f2c5c20be3f 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/require.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/require.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_REQUIRE_HPP
 #define TAO_PEGTL_INTERNAL_REQUIRE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/result_on_found.hpp b/packages/PEGTL/include/tao/pegtl/internal/result_on_found.hpp
index 09fa6ed6ee070ad76d1ba2c67e82bc7825847ebf..df2bcd77b0e9a3d9c97783d456c0e1dd0aed0e6c 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/result_on_found.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/result_on_found.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_RESULT_ON_FOUND_HPP
 #define TAO_PEGTL_INTERNAL_RESULT_ON_FOUND_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/rules.hpp b/packages/PEGTL/include/tao/pegtl/internal/rules.hpp
index 5496a94b8fa261133e276e0e2fddbcffbb871a61..2f89826f167b17fdb77da2a21e20ea930c42925f 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/rules.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/rules.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_RULES_HPP
 #define TAO_PEGTL_INTERNAL_RULES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/seq.hpp b/packages/PEGTL/include/tao/pegtl/internal/seq.hpp
index 11af3dcca8f208cdecd6b1a4e85b9ed2b5f53701..eb0a8624d7eea11ef11996f3885dff7092d6197a 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/seq.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/seq.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_SEQ_HPP
 #define TAO_PEGTL_INTERNAL_SEQ_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/sor.hpp b/packages/PEGTL/include/tao/pegtl/internal/sor.hpp
index 0a8fcdf73a718634b8d208fc3f07e40bd2766e04..b09d6779c0cd781257028754012ab5f116afb35b 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/sor.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/sor.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_SOR_HPP
 #define TAO_PEGTL_INTERNAL_SOR_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/star.hpp b/packages/PEGTL/include/tao/pegtl/internal/star.hpp
index de779dffc5fed4c610e6767b99836efab2628fcb..bc2360bf759b911ef7c27b74f9423fdbbb142a45 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/star.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/star.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_STAR_HPP
 #define TAO_PEGTL_INTERNAL_STAR_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/star_must.hpp b/packages/PEGTL/include/tao/pegtl/internal/star_must.hpp
index 2ac45e42bc2cef61d459ec5d9e8750decb393174..8dff5771bef509bce1fb6634aea0f75006b89f7e 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/star_must.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/star_must.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_STAR_MUST_HPP
 #define TAO_PEGTL_INTERNAL_STAR_MUST_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/state.hpp b/packages/PEGTL/include/tao/pegtl/internal/state.hpp
index e86965bf3bf3db50a484044d67741c246d82beeb..4ed8d22c859b10782101168cb60320d8da5047d0 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/state.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/state.hpp
@@ -1,11 +1,15 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_STATE_HPP
 #define TAO_PEGTL_INTERNAL_STATE_HPP
 
 #include "../config.hpp"
 
+#include <type_traits>
+
+#include "dependent_false.hpp"
 #include "enable_control.hpp"
 #include "seq.hpp"
 #include "success.hpp"
@@ -16,18 +20,18 @@
 
 namespace TAO_PEGTL_NAMESPACE::internal
 {
-   template< typename State, typename... Rules >
+   template< typename NewState, typename... Rules >
    struct state
-      : state< State, seq< Rules... > >
+      : state< NewState, seq< Rules... > >
    {};
 
-   template< typename State >
-   struct state< State >
+   template< typename NewState >
+   struct state< NewState >
       : success
    {};
 
-   template< typename State, typename Rule >
-   struct state< State, Rule >
+   template< typename NewState, typename Rule >
+   struct state< NewState, Rule >
    {
       using rule_t = state;
       using subs_t = type_list< Rule >;
@@ -42,17 +46,30 @@ namespace TAO_PEGTL_NAMESPACE::internal
                 typename... States >
       [[nodiscard]] static bool match( ParseInput& in, States&&... st )
       {
-         State s( static_cast< const ParseInput& >( in ), st... );
-         if( Control< Rule >::template match< A, M, Action, Control >( in, s ) ) {
-            s.success( static_cast< const ParseInput& >( in ), st... );
-            return true;
+         if constexpr( std::is_constructible_v< NewState, const ParseInput&, States... > ) {
+            NewState s( static_cast< const ParseInput& >( in ), st... );
+            if( Control< Rule >::template match< A, M, Action, Control >( in, s ) ) {
+               s.success( static_cast< const ParseInput& >( in ), st... );
+               return true;
+            }
+            return false;
+         }
+         else if constexpr( std::is_default_constructible_v< NewState > ) {
+            NewState s;
+            if( Control< Rule >::template match< A, M, Action, Control >( in, s ) ) {
+               s.success( static_cast< const ParseInput& >( in ), st... );
+               return true;
+            }
+            return false;
+         }
+         else {
+            static_assert( internal::dependent_false< NewState >, "unable to instantiate new state" );
          }
-         return false;
       }
    };
 
-   template< typename State, typename... Rules >
-   inline constexpr bool enable_control< state< State, Rules... > > = false;
+   template< typename NewState, typename... Rules >
+   inline constexpr bool enable_control< state< NewState, Rules... > > = false;
 
 }  // namespace TAO_PEGTL_NAMESPACE::internal
 
diff --git a/packages/PEGTL/include/tao/pegtl/internal/string.hpp b/packages/PEGTL/include/tao/pegtl/internal/string.hpp
index d659ab8b328534096af1e256ab0208285c967083..4ad5fcb548ef922fa21b4e3353d9d7eda83d53d1 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/string.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/string.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_STRING_HPP
 #define TAO_PEGTL_INTERNAL_STRING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/success.hpp b/packages/PEGTL/include/tao/pegtl/internal/success.hpp
index 03ea08d7d4c394b10e250f7ee3cdd074e9b3a026..706122bd578b2e98ae425d563556e5bbc91addd3 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/success.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/success.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_SUCCESS_HPP
 #define TAO_PEGTL_INTERNAL_SUCCESS_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/try_catch_type.hpp b/packages/PEGTL/include/tao/pegtl/internal/try_catch_type.hpp
index 4e33287e50ad485bc15f9d54406f29f113a9cbfe..ebeb3de01e3ede936f55682d9984f4616a26e930 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/try_catch_type.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/try_catch_type.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_TRY_CATCH_TYPE_HPP
 #define TAO_PEGTL_INTERNAL_TRY_CATCH_TYPE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/internal/until.hpp b/packages/PEGTL/include/tao/pegtl/internal/until.hpp
index b5c5e2152c0a85aa8a94d0cdb97bcf8efc2da0af..b6efd3a18a963feae054cae9f05e363bb96145ba 100644
--- a/packages/PEGTL/include/tao/pegtl/internal/until.hpp
+++ b/packages/PEGTL/include/tao/pegtl/internal/until.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_INTERNAL_UNTIL_HPP
 #define TAO_PEGTL_INTERNAL_UNTIL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/istream_input.hpp b/packages/PEGTL/include/tao/pegtl/istream_input.hpp
index 893316e5c8e06dd1f19f85c7ffa63c8f34b73f22..f37c1bc9653bddfaffaa5656d8e33914b1072278 100644
--- a/packages/PEGTL/include/tao/pegtl/istream_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/istream_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_ISTREAM_INPUT_HPP
 #define TAO_PEGTL_ISTREAM_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/match.hpp b/packages/PEGTL/include/tao/pegtl/match.hpp
index 6ac9f6178a1e17f2f103b687af614b4c027eb87c..ad42582286d1a1e2364d302a60dec8a2aba9aa9a 100644
--- a/packages/PEGTL/include/tao/pegtl/match.hpp
+++ b/packages/PEGTL/include/tao/pegtl/match.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_MATCH_HPP
 #define TAO_PEGTL_MATCH_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/memory_input.hpp b/packages/PEGTL/include/tao/pegtl/memory_input.hpp
index 7cdc9f9f88aa5a2adbaf3158730217d3a2176334..c20887131eb2beff684f74217df55ea5c90f31b0 100644
--- a/packages/PEGTL/include/tao/pegtl/memory_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/memory_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_MEMORY_INPUT_HPP
 #define TAO_PEGTL_MEMORY_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/mmap_input.hpp b/packages/PEGTL/include/tao/pegtl/mmap_input.hpp
index 2362b6c765747a6ccbe7964417434f32fe2bf04f..759ea7035521ff20aac72e5a8d0068f9c25c26fb 100644
--- a/packages/PEGTL/include/tao/pegtl/mmap_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/mmap_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_MMAP_INPUT_HPP
 #define TAO_PEGTL_MMAP_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/must_if.hpp b/packages/PEGTL/include/tao/pegtl/must_if.hpp
index 0140a57a6693d93f5837f2c54d1e8dd05cfb3920..875e7e9c2a20e0a6c948dfcbeeba0f42e7ff4a4a 100644
--- a/packages/PEGTL/include/tao/pegtl/must_if.hpp
+++ b/packages/PEGTL/include/tao/pegtl/must_if.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_MUST_IF_HPP
 #define TAO_PEGTL_MUST_IF_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/normal.hpp b/packages/PEGTL/include/tao/pegtl/normal.hpp
index ac9d77e26585a8307298e8da256b69b008ee2cf8..bdbdd85eedf79a606db8a376c473e7a6ac9f47c5 100644
--- a/packages/PEGTL/include/tao/pegtl/normal.hpp
+++ b/packages/PEGTL/include/tao/pegtl/normal.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_NORMAL_HPP
 #define TAO_PEGTL_NORMAL_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/nothing.hpp b/packages/PEGTL/include/tao/pegtl/nothing.hpp
index 302b43555cf55adbc70a9fbdafad9b1451968e18..28b68f8449c09c81303f53d9a965aadcac05bf74 100644
--- a/packages/PEGTL/include/tao/pegtl/nothing.hpp
+++ b/packages/PEGTL/include/tao/pegtl/nothing.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_NOTHING_HPP
 #define TAO_PEGTL_NOTHING_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/parse.hpp b/packages/PEGTL/include/tao/pegtl/parse.hpp
index 3888fb3fc3ed619423ad4a3af5abaa2510fa9e3c..962f3051a2bdc94af2efc7157ba5048e64714db0 100644
--- a/packages/PEGTL/include/tao/pegtl/parse.hpp
+++ b/packages/PEGTL/include/tao/pegtl/parse.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_PARSE_HPP
 #define TAO_PEGTL_PARSE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/parse_error.hpp b/packages/PEGTL/include/tao/pegtl/parse_error.hpp
index df62ce6811129c929d2bd2edec924c08fa24fc8a..6a0ad019510caf8aa95132f1766d3806b11f99c7 100644
--- a/packages/PEGTL/include/tao/pegtl/parse_error.hpp
+++ b/packages/PEGTL/include/tao/pegtl/parse_error.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_PARSE_ERROR_HPP
 #define TAO_PEGTL_PARSE_ERROR_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/position.hpp b/packages/PEGTL/include/tao/pegtl/position.hpp
index b4f1c3b263f12090ae779cdd6fc1fb6c6a35751e..78ed32c9f35db5355f4df69ea6f43beed440d1e8 100644
--- a/packages/PEGTL/include/tao/pegtl/position.hpp
+++ b/packages/PEGTL/include/tao/pegtl/position.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_POSITION_HPP
 #define TAO_PEGTL_POSITION_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/read_input.hpp b/packages/PEGTL/include/tao/pegtl/read_input.hpp
index 019c9cd20b806f28772825a1bd9fa319b4c818a8..adbad4ca0484f8aa3669e92948f4faa3bff023f5 100644
--- a/packages/PEGTL/include/tao/pegtl/read_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/read_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_READ_INPUT_HPP
 #define TAO_PEGTL_READ_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/require_apply.hpp b/packages/PEGTL/include/tao/pegtl/require_apply.hpp
index 1200909ae8bcd3628f92536e7c3b7fc592b39376..7eaa5cc07bc2d85f5ed8d3037b03e8200d8110d1 100644
--- a/packages/PEGTL/include/tao/pegtl/require_apply.hpp
+++ b/packages/PEGTL/include/tao/pegtl/require_apply.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_REQUIRE_APPLY_HPP
 #define TAO_PEGTL_REQUIRE_APPLY_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/require_apply0.hpp b/packages/PEGTL/include/tao/pegtl/require_apply0.hpp
index 368f9b37d26a213ef23962d283e5d0aa845ef940..a20f0a21085a4db72278511ecc1b2df93c3dc936 100644
--- a/packages/PEGTL/include/tao/pegtl/require_apply0.hpp
+++ b/packages/PEGTL/include/tao/pegtl/require_apply0.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_REQUIRE_APPLY0_HPP
 #define TAO_PEGTL_REQUIRE_APPLY0_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/rewind_mode.hpp b/packages/PEGTL/include/tao/pegtl/rewind_mode.hpp
index 48e9dc9528d6f2ce2b671b1a359b2208672a4d7c..b97cbcfaf4e53593c7390b2170c22eb11226371e 100644
--- a/packages/PEGTL/include/tao/pegtl/rewind_mode.hpp
+++ b/packages/PEGTL/include/tao/pegtl/rewind_mode.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_REWIND_MODE_HPP
 #define TAO_PEGTL_REWIND_MODE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/rules.hpp b/packages/PEGTL/include/tao/pegtl/rules.hpp
index 675098833215d9b2e075d370d7487d9b10f158a8..59d73c742fae9131514ecd01aa421a71a3191ba5 100644
--- a/packages/PEGTL/include/tao/pegtl/rules.hpp
+++ b/packages/PEGTL/include/tao/pegtl/rules.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_RULES_HPP
 #define TAO_PEGTL_RULES_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/string_input.hpp b/packages/PEGTL/include/tao/pegtl/string_input.hpp
index 43349eb6f0e76480ff879cba04eb79eecda54152..f66ee24c8fb8ddd348efeb5393187852e6783274 100644
--- a/packages/PEGTL/include/tao/pegtl/string_input.hpp
+++ b/packages/PEGTL/include/tao/pegtl/string_input.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_STRING_INPUT_HPP
 #define TAO_PEGTL_STRING_INPUT_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/tracking_mode.hpp b/packages/PEGTL/include/tao/pegtl/tracking_mode.hpp
index 18e11f4d11668e96cdd4ae6a0b7a1dcc3eeb94e6..6e0f06b8fd5b74934c4a68d84502366eeaf639ad 100644
--- a/packages/PEGTL/include/tao/pegtl/tracking_mode.hpp
+++ b/packages/PEGTL/include/tao/pegtl/tracking_mode.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_TRACKING_MODE_HPP
 #define TAO_PEGTL_TRACKING_MODE_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/type_list.hpp b/packages/PEGTL/include/tao/pegtl/type_list.hpp
index 243edeb8c15a4229c25729a55505f16d3f6d83cd..f0704aabea50744da880609ee97aeb47eb8b41e2 100644
--- a/packages/PEGTL/include/tao/pegtl/type_list.hpp
+++ b/packages/PEGTL/include/tao/pegtl/type_list.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_TYPE_LIST_HPP
 #define TAO_PEGTL_TYPE_LIST_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/utf8.hpp b/packages/PEGTL/include/tao/pegtl/utf8.hpp
index f0c0a83a5e3accbef3c61fe1d4b9fe300bd48136..95b45ef49ccab20bb6a8b8bab23ad708104f4410 100644
--- a/packages/PEGTL/include/tao/pegtl/utf8.hpp
+++ b/packages/PEGTL/include/tao/pegtl/utf8.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_UTF8_HPP
 #define TAO_PEGTL_UTF8_HPP
diff --git a/packages/PEGTL/include/tao/pegtl/version.hpp b/packages/PEGTL/include/tao/pegtl/version.hpp
index 2ffc6b953b1869b690588655bf26281f84fb68d5..034954416633cf614672005f8047bd3b275a35d4 100644
--- a/packages/PEGTL/include/tao/pegtl/version.hpp
+++ b/packages/PEGTL/include/tao/pegtl/version.hpp
@@ -1,13 +1,14 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_VERSION_HPP
 #define TAO_PEGTL_VERSION_HPP
 
-#define TAO_PEGTL_VERSION "3.2.1"
+#define TAO_PEGTL_VERSION "3.3.0"
 
 #define TAO_PEGTL_VERSION_MAJOR 3
-#define TAO_PEGTL_VERSION_MINOR 2
-#define TAO_PEGTL_VERSION_PATCH 1
+#define TAO_PEGTL_VERSION_MINOR 3
+#define TAO_PEGTL_VERSION_PATCH 0
 
 #endif
diff --git a/packages/PEGTL/include/tao/pegtl/visit.hpp b/packages/PEGTL/include/tao/pegtl/visit.hpp
index dfce2f756650804ad3ba2adddb7186416e894fe4..81ca6ac75cb151d9694b72f15b8f0196f66f1f52 100644
--- a/packages/PEGTL/include/tao/pegtl/visit.hpp
+++ b/packages/PEGTL/include/tao/pegtl/visit.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_VISIT_HPP
 #define TAO_PEGTL_VISIT_HPP
diff --git a/packages/PEGTL/src/example/pegtl/CMakeLists.txt b/packages/PEGTL/src/example/pegtl/CMakeLists.txt
index 58e1f593b2e995f9b27510cbbeb0a79d0f8ffd82..236d365ddc71c1ce53e984963df5abd4c026765e 100644
--- a/packages/PEGTL/src/example/pegtl/CMakeLists.txt
+++ b/packages/PEGTL/src/example/pegtl/CMakeLists.txt
@@ -11,6 +11,7 @@ set(example_sources
   expression.cpp
   hello_world.cpp
   indent_aware.cpp
+  iri.cpp
   json_analyze.cpp
   json_ast.cpp
   json_build.cpp
@@ -25,7 +26,6 @@ set(example_sources
   modulus_match.cpp
   parse_tree.cpp
   parse_tree_user_state.cpp
-  peg2pegtl.cpp
   proto3.cpp
   recover.cpp
   s_expression.cpp
diff --git a/packages/PEGTL/src/example/pegtl/abnf2pegtl.cpp b/packages/PEGTL/src/example/pegtl/abnf2pegtl.cpp
index 90ac59a5188c289e5cfdced53cfd54681ad2860d..3fd3b976941bb5c87770f31c33dbb46f14f886a1 100644
--- a/packages/PEGTL/src/example/pegtl/abnf2pegtl.cpp
+++ b/packages/PEGTL/src/example/pegtl/abnf2pegtl.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <algorithm>
 #include <exception>
diff --git a/packages/PEGTL/src/example/pegtl/analyze.cpp b/packages/PEGTL/src/example/pegtl/analyze.cpp
index 94b4c9aa5546f5c2b5e55771a6a8fdff91bbcc2c..8f7aa0057d4f51ff363936f6dfb70db07993dfa8 100644
--- a/packages/PEGTL/src/example/pegtl/analyze.cpp
+++ b/packages/PEGTL/src/example/pegtl/analyze.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <tao/pegtl.hpp>
 
diff --git a/packages/PEGTL/src/example/pegtl/calculator.cpp b/packages/PEGTL/src/example/pegtl/calculator.cpp
index f69110c06f49a2c56ac8be81a6b05ca41e02dbea..ed9fb2b05512e06196ac23e136457b869c9fbb8d 100644
--- a/packages/PEGTL/src/example/pegtl/calculator.cpp
+++ b/packages/PEGTL/src/example/pegtl/calculator.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cassert>
 #include <functional>
diff --git a/packages/PEGTL/src/example/pegtl/chomsky_hierarchy.cpp b/packages/PEGTL/src/example/pegtl/chomsky_hierarchy.cpp
index a135d3e50da36aa72754944aff4a628cf822c43e..cb671c0e8894908e1dd3cf8f53bc852a617a2623 100644
--- a/packages/PEGTL/src/example/pegtl/chomsky_hierarchy.cpp
+++ b/packages/PEGTL/src/example/pegtl/chomsky_hierarchy.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cassert>
 #include <cstring>
diff --git a/packages/PEGTL/src/example/pegtl/csv1.cpp b/packages/PEGTL/src/example/pegtl/csv1.cpp
index 5bcdb2b336d9952c896eb8a0946e6b37020b5ad3..826363fde8d3b6dc213a591e3db7372c8f341444 100644
--- a/packages/PEGTL/src/example/pegtl/csv1.cpp
+++ b/packages/PEGTL/src/example/pegtl/csv1.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cassert>
 #include <cstdint>
diff --git a/packages/PEGTL/src/example/pegtl/csv2.cpp b/packages/PEGTL/src/example/pegtl/csv2.cpp
index fdc363634edb3c8d91663b2ab2df50f469fce561..c4aafd56ca8860ec60e9857011b7b71f5f0a8b96 100644
--- a/packages/PEGTL/src/example/pegtl/csv2.cpp
+++ b/packages/PEGTL/src/example/pegtl/csv2.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <exception>
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/double.hpp b/packages/PEGTL/src/example/pegtl/double.hpp
index 8ceec7cd1c61727496bd6ebf378084a52d084552..c79f5cc755659155e64db2edfef6be602b476f2a 100644
--- a/packages/PEGTL/src/example/pegtl/double.hpp
+++ b/packages/PEGTL/src/example/pegtl/double.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_EXAMPLES_PEGTL_DOUBLE_HPP
 #define TAO_PEGTL_SRC_EXAMPLES_PEGTL_DOUBLE_HPP
diff --git a/packages/PEGTL/src/example/pegtl/dynamic_match.cpp b/packages/PEGTL/src/example/pegtl/dynamic_match.cpp
index 73ed8258df709cf2c6f1f6bdc1932ddb5c932d5f..bf1465066e88c21dbcb5703f7a84ca7cb971299e 100644
--- a/packages/PEGTL/src/example/pegtl/dynamic_match.cpp
+++ b/packages/PEGTL/src/example/pegtl/dynamic_match.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cassert>
 #include <cstring>
diff --git a/packages/PEGTL/src/example/pegtl/expression.cpp b/packages/PEGTL/src/example/pegtl/expression.cpp
index 030b0d6db3f689d996d2e243486fa0930852141c..5602ad3f09fb464f4fff6d963c1239d8496506b6 100644
--- a/packages/PEGTL/src/example/pegtl/expression.cpp
+++ b/packages/PEGTL/src/example/pegtl/expression.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/hello_world.cpp b/packages/PEGTL/src/example/pegtl/hello_world.cpp
index a15273daf6c28f9f1231aa85a84c62569b2ae01b..8ecbdb813a87e41c1410da79e786acb9e507a9fb 100644
--- a/packages/PEGTL/src/example/pegtl/hello_world.cpp
+++ b/packages/PEGTL/src/example/pegtl/hello_world.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 #include <string>
diff --git a/packages/PEGTL/src/example/pegtl/indent_aware.cpp b/packages/PEGTL/src/example/pegtl/indent_aware.cpp
index 42295a2226777da67bcd71cdc058a0a3c08b964c..1462f89af04646a740182e9554e0b5c965f8585e 100644
--- a/packages/PEGTL/src/example/pegtl/indent_aware.cpp
+++ b/packages/PEGTL/src/example/pegtl/indent_aware.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/iri.cpp b/packages/PEGTL/src/example/pegtl/iri.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aed3e981bf7630fd85f8ea330b1e5b373b6e317b
--- /dev/null
+++ b/packages/PEGTL/src/example/pegtl/iri.cpp
@@ -0,0 +1,102 @@
+// Copyright (c) 2021 Kelvin Hammond
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
+
+#if !defined( __cpp_exceptions )
+#include <iostream>
+int main()
+{
+   std::cerr << "Exception support required, example unavailable." << std::endl;
+   return 1;
+}
+#else
+
+#include <tao/pegtl.hpp>
+#include <tao/pegtl/contrib/iri.hpp>
+
+#include <iostream>
+
+namespace pegtl = TAO_PEGTL_NAMESPACE;
+
+struct IRI
+{
+   std::string scheme;
+   std::string authority;
+   std::string userinfo;
+   std::string host;
+   std::string port;
+   std::string path;
+   std::string query;
+   std::string fragment;
+
+   explicit IRI( const std::string& iri );
+};
+
+namespace iri
+{
+   template< std::string IRI::*Field >
+   struct bind
+   {
+      template< typename ActionInput >
+      static void apply( const ActionInput& in, IRI& iri )
+      {
+         iri.*Field = in.string();
+      }
+   };
+
+   // clang-format off
+   template< typename Rule > struct action {};
+
+   template<> struct action< pegtl::iri::scheme > : bind< &IRI::scheme > {};
+   template<> struct action< pegtl::iri::iauthority > : bind< &IRI::authority > {};
+   // userinfo: see below
+   template<> struct action< pegtl::iri::ihost > : bind< &IRI::host > {};
+   template<> struct action< pegtl::iri::port > : bind< &IRI::port > {};
+   template<> struct action< pegtl::iri::ipath_noscheme > : bind< &IRI::path > {};
+   template<> struct action< pegtl::iri::ipath_rootless > : bind< &IRI::path > {};
+   template<> struct action< pegtl::iri::ipath_absolute > : bind< &IRI::path > {};
+   template<> struct action< pegtl::iri::ipath_abempty > : bind< &IRI::path > {};
+   template<> struct action< pegtl::iri::iquery > : bind< &IRI::query > {};
+   template<> struct action< pegtl::iri::ifragment > : bind< &IRI::fragment > {};
+   // clang-format on
+
+   template<>
+   struct action< pegtl::iri::opt_iuserinfo >
+   {
+      template< typename ActionInput >
+      static void apply( const ActionInput& in, IRI& iri )
+      {
+         if( !in.empty() ) {
+            iri.userinfo = std::string( in.begin(), in.size() - 1 );
+         }
+      }
+   };
+
+}  // namespace iri
+
+IRI::IRI( const std::string& iri )
+{
+   using grammar = pegtl::must< pegtl::iri::IRI >;
+   pegtl::memory_input input( iri, "iri" );
+   pegtl::parse< grammar, iri::action >( input, *this );
+}
+
+int main( int argc, char** argv )
+{
+   for( int i = 1; i < argc; ++i ) {
+      std::cout << "Parsing " << argv[ i ] << std::endl;
+      const IRI iri( argv[ i ] );
+      std::cout << "IRI.scheme: " << iri.scheme << std::endl;
+      std::cout << "IRI.authority: " << iri.authority << std::endl;
+      std::cout << "IRI.userinfo: " << iri.userinfo << std::endl;
+      std::cout << "IRI.host: " << iri.host << std::endl;
+      std::cout << "IRI.port: " << iri.port << std::endl;
+      std::cout << "IRI.path: " << iri.path << std::endl;
+      std::cout << "IRI.query: " << iri.query << std::endl;
+      std::cout << "IRI.fragment: " << iri.fragment << std::endl;
+   }
+   return 0;
+}
+
+#endif
diff --git a/packages/PEGTL/src/example/pegtl/json_analyze.cpp b/packages/PEGTL/src/example/pegtl/json_analyze.cpp
index ed5c83a2be8961fa3751ed9be20257f9bfb3271e..9a23359a53f694f672c821f78eb24738a729762a 100644
--- a/packages/PEGTL/src/example/pegtl/json_analyze.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_analyze.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 
diff --git a/packages/PEGTL/src/example/pegtl/json_ast.cpp b/packages/PEGTL/src/example/pegtl/json_ast.cpp
index fecaab39a17d51b4d09e989f1d77f148aaade17c..e227ea874f418f034b9de2411cd4b12157ad181c 100644
--- a/packages/PEGTL/src/example/pegtl/json_ast.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_ast.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iomanip>
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/json_build.cpp b/packages/PEGTL/src/example/pegtl/json_build.cpp
index c5e607b7c46047df3ddf93ba8122f6be5e7d0e01..00d90a587455f721b7e5541ad9bd454fc6762cc2 100644
--- a/packages/PEGTL/src/example/pegtl/json_build.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_build.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cassert>
 #include <iomanip>
diff --git a/packages/PEGTL/src/example/pegtl/json_classes.hpp b/packages/PEGTL/src/example/pegtl/json_classes.hpp
index 3e69515b0e35066c3c6b6f141a1a0c91e85a7f11..840804328240a2f5534642682740576bd1e9b97d 100644
--- a/packages/PEGTL/src/example/pegtl/json_classes.hpp
+++ b/packages/PEGTL/src/example/pegtl/json_classes.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_EXAMPLES_PEGTL_JSON_CLASSES_HPP
 #define TAO_PEGTL_SRC_EXAMPLES_PEGTL_JSON_CLASSES_HPP
diff --git a/packages/PEGTL/src/example/pegtl/json_count.cpp b/packages/PEGTL/src/example/pegtl/json_count.cpp
index 09bf155ef930b3bdc5b28f00798d49f1a7ab0292..f5de1e2ed3b342b84c26faa152ea9760ba2227b0 100644
--- a/packages/PEGTL/src/example/pegtl/json_count.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_count.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cstddef>
 #include <iomanip>
diff --git a/packages/PEGTL/src/example/pegtl/json_coverage.cpp b/packages/PEGTL/src/example/pegtl/json_coverage.cpp
index 3748eb16003079d866c8b3c952e090564854ece3..99bf55a14ef4d383ec3692fb95e891e038e00b98 100644
--- a/packages/PEGTL/src/example/pegtl/json_coverage.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_coverage.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iomanip>
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/json_errors.hpp b/packages/PEGTL/src/example/pegtl/json_errors.hpp
index e0517af74dc6e92a1859f3ec4bb1b74cd4466dc2..5dd28f0ac628f9f6fd7b0faaf46560bcaaa03026 100644
--- a/packages/PEGTL/src/example/pegtl/json_errors.hpp
+++ b/packages/PEGTL/src/example/pegtl/json_errors.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_EXAMPLES_PEGTL_JSON_ERRORS_HPP
 #define TAO_PEGTL_SRC_EXAMPLES_PEGTL_JSON_ERRORS_HPP
diff --git a/packages/PEGTL/src/example/pegtl/json_parse.cpp b/packages/PEGTL/src/example/pegtl/json_parse.cpp
index 240a8898c38744eff056e5cd885a4464e93450de..cdff8772a535a1a275e5bac85504998063e0e506 100644
--- a/packages/PEGTL/src/example/pegtl/json_parse.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_parse.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iomanip>
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/json_print_debug.cpp b/packages/PEGTL/src/example/pegtl/json_print_debug.cpp
index ce7b08862b9474aeff3b70be9ee8a392c4d4b577..8adadfdb0aad76e2b702a1fe9d8ca300c6c13e58 100644
--- a/packages/PEGTL/src/example/pegtl/json_print_debug.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_print_debug.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 
diff --git a/packages/PEGTL/src/example/pegtl/json_print_names.cpp b/packages/PEGTL/src/example/pegtl/json_print_names.cpp
index ffb4a929a559691437acad0d70d37513aa6acc7d..2537b7c06b18fb21e50266acc6fc72c00e913819 100644
--- a/packages/PEGTL/src/example/pegtl/json_print_names.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_print_names.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 
diff --git a/packages/PEGTL/src/example/pegtl/json_trace.cpp b/packages/PEGTL/src/example/pegtl/json_trace.cpp
index 0084722c54fdbb7b8fd603de2f137afb62c2947b..345118a29e0681f845d545883c1f3274808687e0 100644
--- a/packages/PEGTL/src/example/pegtl/json_trace.cpp
+++ b/packages/PEGTL/src/example/pegtl/json_trace.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iomanip>
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/json_unescape.hpp b/packages/PEGTL/src/example/pegtl/json_unescape.hpp
index 79c6074c5668a3d8120f4877bad9f59fa3bfc86b..210087e205d9bd27fa2ee19e9a3cadb7f47ad17d 100644
--- a/packages/PEGTL/src/example/pegtl/json_unescape.hpp
+++ b/packages/PEGTL/src/example/pegtl/json_unescape.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_EXAMPLES_PEGTL_JSON_UNESCAPE_HPP
 #define TAO_PEGTL_SRC_EXAMPLES_PEGTL_JSON_UNESCAPE_HPP
diff --git a/packages/PEGTL/src/example/pegtl/lua53.hpp b/packages/PEGTL/src/example/pegtl/lua53.hpp
index 3bfc953702c4192849e610eeb9978820331ebe52..da57ea822bb17947e8725e32ceabd64ab5a84fa0 100644
--- a/packages/PEGTL/src/example/pegtl/lua53.hpp
+++ b/packages/PEGTL/src/example/pegtl/lua53.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_EXAMPLES_PEGTL_LUA53_HPP
 #define TAO_PEGTL_SRC_EXAMPLES_PEGTL_LUA53_HPP
diff --git a/packages/PEGTL/src/example/pegtl/lua53_analyze.cpp b/packages/PEGTL/src/example/pegtl/lua53_analyze.cpp
index 788ee84bac12b2d034f2c3b203e6eed409a07f8e..d785e2a98c2139ef6fddab75cb5b95beceb6e658 100644
--- a/packages/PEGTL/src/example/pegtl/lua53_analyze.cpp
+++ b/packages/PEGTL/src/example/pegtl/lua53_analyze.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/lua53_parse.cpp b/packages/PEGTL/src/example/pegtl/lua53_parse.cpp
index 72c1fddabbb34f5cd8beb78d9d4f69cd4108dde9..317e23c47b2454d33fb9d1817e61d51bd35b63a2 100644
--- a/packages/PEGTL/src/example/pegtl/lua53_parse.cpp
+++ b/packages/PEGTL/src/example/pegtl/lua53_parse.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/modulus_match.cpp b/packages/PEGTL/src/example/pegtl/modulus_match.cpp
index f3a7d348d50d26ed23b793412e5472e684842e4e..fd866c5a53d167f67f043e833c41d98c38a089f7 100644
--- a/packages/PEGTL/src/example/pegtl/modulus_match.cpp
+++ b/packages/PEGTL/src/example/pegtl/modulus_match.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <tao/pegtl.hpp>
 
diff --git a/packages/PEGTL/src/example/pegtl/parse_tree.cpp b/packages/PEGTL/src/example/pegtl/parse_tree.cpp
index 1a37e3d05bd4dd28b930231e3a27d6a88d063ebd..c105e1621fc06a747f7c07ad5aacb930a14fd647 100644
--- a/packages/PEGTL/src/example/pegtl/parse_tree.cpp
+++ b/packages/PEGTL/src/example/pegtl/parse_tree.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <array>
 #include <iomanip>
diff --git a/packages/PEGTL/src/example/pegtl/parse_tree_user_state.cpp b/packages/PEGTL/src/example/pegtl/parse_tree_user_state.cpp
index 7268bca8a3b5372ee197895e33fe155346bc9004..6bb7a382a26fec64a91d16e6ac8451b66db3a978 100644
--- a/packages/PEGTL/src/example/pegtl/parse_tree_user_state.cpp
+++ b/packages/PEGTL/src/example/pegtl/parse_tree_user_state.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <type_traits>
 
diff --git a/packages/PEGTL/src/example/pegtl/peg.peg b/packages/PEGTL/src/example/pegtl/peg.peg
deleted file mode 100644
index 99a7c12a89466a1bb6d89a1fae70ed8a06dd418c..0000000000000000000000000000000000000000
--- a/packages/PEGTL/src/example/pegtl/peg.peg
+++ /dev/null
@@ -1,37 +0,0 @@
-# Parsing Expression Grammar (PEG) taken from
-# https://pdos.csail.mit.edu/~baford/packrat/popl04/peg-popl04.pdf
-
-# Hierarchical syntax
-Grammar <- Spacing Definition+ EndOfFile
-Definition <- Identifier LEFTARROW Expression
-Expression <- Sequence (SLASH Sequence)*
-Sequence <- Prefix*
-Prefix <- (AND / NOT)? Suffix
-Suffix <- Primary (QUESTION / STAR / PLUS)?
-Primary <- Identifier !LEFTARROW / OPEN Expression CLOSE / Literal / Class / DOT
-
-# Lexical syntax
-Identifier <- IdentStart IdentCont* Spacing
-IdentStart <- [a-zA-Z_]
-IdentCont <- IdentStart / [0-9]
-Literal <- ['] (!['] Char)* ['] Spacing / ["] (!["] Char)* ["] Spacing
-Class <- '[' (!']' Range)* ']' Spacing
-Range <- Char '-' Char / Char
-Char <- '\\' [nrt'"\[\]\\] / '\\' [0-2][0-7][0-7] / '\\' [0-7][0-7]? / !'\\' .
-
-LEFTARROW <- '<-' Spacing
-SLASH <- '/' Spacing
-AND <- '&' Spacing
-NOT <- '!' Spacing
-QUESTION <- '?' Spacing
-STAR <- '*' Spacing
-PLUS <- '+' Spacing
-OPEN <- '(' Spacing
-CLOSE <- ')' Spacing
-DOT <- '.' Spacing
-
-Spacing <- (Space / Comment)*
-Comment <- '#' (!EndOfLine .)* EndOfLine
-Space <- ' ' / '\t' / EndOfLine
-EndOfLine <- '\r\n' / '\n' / '\r'
-EndOfFile <- !.
\ No newline at end of file
diff --git a/packages/PEGTL/src/example/pegtl/peg2pegtl.cpp b/packages/PEGTL/src/example/pegtl/peg2pegtl.cpp
deleted file mode 100644
index 60b2628fbee7e6ae8bb9693ea36d569da26cdcf2..0000000000000000000000000000000000000000
--- a/packages/PEGTL/src/example/pegtl/peg2pegtl.cpp
+++ /dev/null
@@ -1,545 +0,0 @@
-// Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Copyright (c) 2021 Daniel Deptford
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
-
-#include <algorithm>
-#include <iomanip>
-#include <iostream>
-#include <iterator>
-#include <sstream>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <cassert>
-#include <cctype>
-#include <cstdlib>
-
-#if defined( _MSC_VER )
-#include <string.h>
-#define TAO_PEGTL_STRCASECMP _stricmp
-#else
-#include <strings.h>
-#define TAO_PEGTL_STRCASECMP strcasecmp
-#endif
-
-#include <tao/pegtl.hpp>
-#include <tao/pegtl/contrib/parse_tree.hpp>
-#include <tao/pegtl/contrib/peg.hpp>
-
-namespace TAO_PEGTL_NAMESPACE
-{
-   namespace peg
-   {
-      using node_ptr = std::unique_ptr< parse_tree::node >;
-
-      namespace
-      {
-         std::string prefix = "tao::pegtl::";
-
-         std::unordered_set< std::string > keywords = {
-            "alignas",
-            "alignof",
-            "and",
-            "and_eq",
-            "asm",
-            "auto",
-            "bitand",
-            "bitor",
-            "bool",
-            "break",
-            "case",
-            "catch",
-            "char",
-            "char8_t",
-            "char16_t",
-            "char32_t",
-            "class",
-            "compl",
-            "concept",
-            "const",
-            "consteval",
-            "constexpr",
-            "constinit",
-            "const_cast",
-            "continue",
-            "co_await",
-            "co_return",
-            "co_yield",
-            "decltype",
-            "default",
-            "delete",
-            "do",
-            "double",
-            "dynamic_cast",
-            "else",
-            "enum",
-            "explicit",
-            "export",
-            "extern",
-            "false",
-            "float",
-            "for",
-            "friend",
-            "goto",
-            "if",
-            "inline",
-            "int",
-            "long",
-            "mutable",
-            "namespace",
-            "new",
-            "noexcept",
-            "not",
-            "not_eq",
-            "nullptr",
-            "operator",
-            "or",
-            "or_eq",
-            "private",
-            "protected",
-            "public",
-            "register",
-            "reinterpret_cast",
-            "return",
-            "requires",
-            "short",
-            "signed",
-            "sizeof",
-            "static",
-            "static_assert",
-            "static_cast",
-            "struct",
-            "switch",
-            "template",
-            "this",
-            "thread_local",
-            "throw",
-            "true",
-            "try",
-            "typedef",
-            "typeid",
-            "typename",
-            "union",
-            "unsigned",
-            "using",
-            "virtual",
-            "void",
-            "volatile",
-            "wchar_t",
-            "while",
-            "xor",
-            "xor_eq"
-         };
-
-         using identifiers_t = std::vector< std::string >;
-         identifiers_t identifiers_defined;
-         identifiers_t identifiers;
-
-         identifiers_t::reverse_iterator find_identifier( identifiers_t& r, const std::string& v, const identifiers_t::reverse_iterator& rbegin )
-         {
-            return std::find_if( rbegin, r.rend(), [ & ]( const identifiers_t::value_type& p ) { return TAO_PEGTL_STRCASECMP( p.c_str(), v.c_str() ) == 0; } );
-         }
-
-         identifiers_t::reverse_iterator find_identifier( identifiers_t& r, const std::string& v )
-         {
-            return find_identifier( r, v, r.rbegin() );
-         }
-
-         char char_node_to_char( const node_ptr& n )
-         {
-            const char ch = n->string_view().at( 0 );
-
-            if( ch == '\\' ) {
-               static const std::unordered_map< char, char > mappings( {
-                  { 'n', '\n' },
-                  { 'r', '\r' },
-                  { 't', '\t' },
-                  { '\'', '\'' },
-                  { '\"', '\"' },
-                  { '[', '[' },
-                  { ']', ']' },
-                  { '\\', '\\' },
-               } );
-
-               auto iter = mappings.find( n->string_view().at( 1 ) );
-               if( iter != std::end( mappings ) ) {
-                  return iter->second;
-               }
-
-               return static_cast< char >( std::stoi( n->string().substr( 1 ) ) );
-            }
-
-            return ch;
-         }
-
-         void append_char_node( std::string& s, const node_ptr& n )
-         {
-            if( !s.empty() ) {
-               s += ", ";
-            }
-            s += '\'';
-
-            const char c = char_node_to_char( n );
-
-            static const std::unordered_map< char, std::string > escapes( {
-               { '\b', "b" },
-               { '\f', "f" },
-               { '\n', "n" },
-               { '\r', "r" },
-               { '\t', "t" },
-               { '\v', "v" },
-               { '\\', "\\" },
-               { '\'', "\'" },
-            } );
-
-            auto iter = escapes.find( c );
-            if( iter != std::end( escapes ) ) {
-               s += '\\';
-               s += iter->second;
-            }
-            else {
-               s += c;
-            }
-
-            s += '\'';
-         }
-
-      }  // namespace
-
-#if defined( __cpp_exceptions )
-      // Using must_if<> we define a control class which is used for
-      // the parsing run instead of the default control class.
-      //
-      // This improves the errors reported to the user.
-      //
-      // The following turns local errors into global errors, i.e.
-      // if one of the rules for which a custom error message is
-      // defined fails, it throws a parse_error exception (aka global
-      // failure) instead of returning false (aka local failure).
-
-      // clang-format off
-      template< typename > inline constexpr const char* error_message = nullptr;
-
-      template<> inline constexpr auto error_message< peg::grammar::Char > = "unterminated character literal";
-      template<> inline constexpr auto error_message< peg::grammar::Expression > = "unterminated expression";
-      template<> inline constexpr auto error_message< peg::grammar::Grammar > = "unterminated grammar";
-      template<> inline constexpr auto error_message< peg::grammar::Range > = "unterminated range";
-      // clang-format on
-
-      struct error
-      {
-         template< typename Rule >
-         static constexpr auto message = error_message< Rule >;
-      };
-
-      template< typename Rule >
-      using control = must_if< error >::control< Rule >;
-#else
-      template< typename Rule >
-      using control = normal< Rule >;
-#endif
-
-      // Since we are going to generate a parse tree, we define a
-      // selector that decides which rules will be included in our
-      // parse tree, which rules will be omitted from the parse tree,
-      // and which of the nodes will store the matched content.
-      // Additionally, some nodes will fold when they have exactly
-      // one child node. (see fold_one below)
-
-      template< typename Rule >
-      struct selector
-         : pegtl::parse_tree::selector<
-              Rule,
-              pegtl::parse_tree::store_content::on<
-                 grammar::Definition,
-                 grammar::Prefix,
-                 grammar::Suffix,
-                 grammar::Sequence,
-                 grammar::Expression,
-                 grammar::Class,
-                 grammar::Literal,
-                 grammar::Identifier,
-                 grammar::IdentStart,
-                 grammar::Range,
-                 grammar::Char,
-                 grammar::AND,
-                 grammar::NOT,
-                 grammar::QUESTION,
-                 grammar::STAR,
-                 grammar::PLUS,
-                 grammar::DOT >,
-              pegtl::parse_tree::fold_one::on< grammar::IdentCont > >
-      {
-         template< typename... States >
-         static void transform( node_ptr& n )
-         {
-            // As we use the PEG grammar taken directly from the original PEG
-            // paper, some nodes may have excess content from nodes not included
-            // in the parse tree (e.g. Comment, Space, etc).
-            if( !n->children.empty() ) {
-               n->m_end = n->children.back()->m_end;
-            }
-         }
-      };
-
-      std::string to_string( const node_ptr& n );
-      std::string to_string( const std::vector< node_ptr >& v );
-
-      namespace
-      {
-         std::string get_identifier( const node_ptr& n )
-         {
-            assert( n->is_type< grammar::Identifier >() );
-            std::string v = n->string();
-            std::replace( v.begin(), v.end(), '-', '_' );
-            return v;
-         }
-
-         std::string get_identifier( const node_ptr& n, const bool print_forward_declarations )
-         {
-            std::string v = get_identifier( n );
-            const auto it = find_identifier( identifiers, v );
-            if( it != identifiers.rend() ) {
-               return *it;
-            }
-            if( keywords.count( v ) != 0 || v.find( "__" ) != std::string::npos ) {
-#if defined( __cpp_exceptions )
-               throw parse_error( '\'' + n->string() + "' is a reserved identifier", n->begin() );
-#else
-               std::cerr << '\'' + n->string() + "' is a reserved identifier" << std::endl;
-               std::terminate();
-#endif
-            }
-            if( print_forward_declarations && find_identifier( identifiers_defined, v ) != identifiers_defined.rend() ) {
-               std::cout << "struct " << v << ";\n";
-            }
-            identifiers.push_back( v );
-            return v;
-         }
-
-         std::unordered_map< std::string, parse_tree::node* > previous_identifiers;
-
-      }  // namespace
-
-      template<>
-      struct selector< grammar::Definition >
-         : std::true_type
-      {
-         template< typename... States >
-         static void transform( node_ptr& n )
-         {
-            const auto idname = get_identifier( n->children.front() );
-            assert( n->children.back()->is_type< grammar::Expression >() );
-            if( !previous_identifiers.try_emplace( idname, n.get() ).second ) {
-#if defined( __cpp_exceptions )
-               throw parse_error( "identifier '" + idname + "' is already defined", n->begin() );
-#else
-               std::cerr << "identifier '" + idname + "' is already defined" << std::endl;
-               std::terminate();
-#endif
-            }
-         }
-      };
-
-      // Finally, the generated parse tree for each node is converted to
-      // a C++ source code string.
-
-      struct stringifier
-      {
-         using function_t = std::string ( * )( const node_ptr& n );
-         function_t default_ = nullptr;
-
-         std::unordered_map< std::string_view, function_t > map_;
-
-         template< typename T >
-         void add( const function_t& f )
-         {
-            map_.try_emplace( demangle< T >(), f );
-         }
-
-         std::string operator()( const node_ptr& n ) const
-         {
-            const auto it = map_.find( n->type );
-            if( it != map_.end() ) {
-               return it->second( n );
-            }
-
-            return default_( n );
-         }
-      };
-
-      stringifier make_stringifier()
-      {
-         stringifier nrv;
-         nrv.default_ = []( const node_ptr& n ) -> std::string {
-#if defined( __cpp_exceptions )
-            throw parse_error( "missing to_string() for " + std::string( n->type ), n->begin() );
-#else
-            std::cerr << "missing to_string() for " + std::string( n->type ) << std::endl;
-            std::terminate();
-#endif
-         };
-
-         nrv.add< grammar::Identifier >( []( const node_ptr& n ) { return get_identifier( n, true ); } );
-
-         nrv.add< grammar::Definition >( []( const node_ptr& n ) {
-            return "struct " + get_identifier( n->children.front(), false ) + " : " + to_string( n->children.back() ) + " {};";
-         } );
-
-         nrv.add< grammar::Char >( []( const node_ptr& n ) {
-            std::string s;
-            append_char_node( s, n );
-            return s;
-         } );
-
-         nrv.add< grammar::Sequence >( []( const node_ptr& n ) {
-            if( n->children.size() == 1 ) {
-               return to_string( n->children.front() );
-            }
-
-            return prefix + "seq< " + to_string( n->children ) + " >";
-         } );
-
-         nrv.add< grammar::Expression >( []( const node_ptr& n ) {
-            if( n->children.size() == 1 ) {
-               return to_string( n->children.front() );
-            }
-
-            return prefix + "sor< " + to_string( n->children ) + " >";
-         } );
-
-         nrv.add< grammar::Range >( []( const node_ptr& n ) {
-            if( n->children.size() == 1 ) {
-               return prefix + "one< " + to_string( n->children.front() ) + " >";
-            }
-
-            return prefix + "range< " + to_string( n->children.front() ) + ", " + to_string( n->children.back() ) + " >";
-         } );
-
-         nrv.add< grammar::Class >( []( const node_ptr& n ) {
-            if( n->children.size() == 1 ) {
-               return to_string( n->children.front() );
-            }
-
-            return prefix + "sor < " + to_string( n->children ) + " >";
-         } );
-
-         nrv.add< grammar::Literal >( []( const node_ptr& n ) {
-            if( n->children.size() == 1 ) {
-               return prefix + "one< " + to_string( n->children.front() ) + " >";
-            }
-
-            return prefix + "string< " + to_string( n->children ) + " >";
-         } );
-
-         nrv.add< grammar::Prefix >( []( const node_ptr& n ) {
-            auto sub = to_string( n->children.back() );
-
-            if( n->children.front()->is_type< grammar::AND >() ) {
-               return prefix + "at< " + sub + " >";
-            }
-
-            if( n->children.front()->is_type< grammar::NOT >() ) {
-               return prefix + "not_at< " + sub + " >";
-            }
-
-            assert( n->children.size() == 1 );
-            return sub;
-         } );
-
-         nrv.add< grammar::Suffix >( []( const node_ptr& n ) {
-            auto sub = to_string( n->children.front() );
-
-            if( n->children.back()->is_type< grammar::QUESTION >() ) {
-               return prefix + "opt< " + sub + " >";
-            }
-
-            if( n->children.back()->is_type< grammar::STAR >() ) {
-               return prefix + "star< " + sub + " >";
-            }
-
-            if( n->children.back()->is_type< grammar::PLUS >() ) {
-               return prefix + "plus< " + sub + " >";
-            }
-
-            assert( n->children.size() == 1 );
-            return sub;
-         } );
-
-         nrv.add< grammar::DOT >( []( const node_ptr& /*unused*/ ) {
-            return prefix + "any";
-         } );
-
-         return nrv;
-      }
-
-      std::string to_string( const node_ptr& n )
-      {
-         static stringifier s = make_stringifier();
-         return s( n );
-      }
-
-      std::string to_string( const std::vector< node_ptr >& v )
-      {
-         std::string result;
-         for( const auto& c : v ) {
-            if( !result.empty() ) {
-               result += ", ";
-            }
-            result += to_string( c );
-         }
-         return result;
-      }
-
-   }  // namespace peg
-
-}  // namespace TAO_PEGTL_NAMESPACE
-
-int main( int argc, char** argv )  // NOLINT(bugprone-exception-escape)
-{
-   using namespace TAO_PEGTL_NAMESPACE;
-
-   if( argc != 2 ) {
-      std::cerr << "Usage: " << argv[ 0 ] << " SOURCE\n";
-      return 1;
-   }
-
-   file_input in( argv[ 1 ] );
-#if defined( __cpp_exceptions )
-   try {
-      const auto root = parse_tree::parse< peg::grammar::Grammar, peg::selector, nothing, peg::control >( in );
-
-      for( const auto& definition : root->children ) {
-         peg::identifiers_defined.push_back( peg::get_identifier( definition->children.front() ) );
-      }
-
-      for( const auto& rule : root->children ) {
-         std::cout << peg::to_string( rule ) << '\n';
-      }
-   }
-   catch( const parse_error& e ) {
-      const auto p = e.positions().front();
-      std::cerr << e.what() << '\n'
-                << in.line_at( p ) << '\n'
-                << std::setw( p.column ) << '^' << '\n';
-   }
-#else
-   if( const auto root = parse_tree::parse< peg::grammar::Grammar, peg::selector, nothing, peg::control >( in ) ) {
-      for( const auto& definition : root->children ) {
-         peg::identifiers_defined.push_back( peg::get_identifier( definition->children.front() ) );
-      }
-
-      for( const auto& rule : root->children ) {
-         std::cout << peg::to_string( rule ) << '\n';
-      }
-   }
-   else {
-      std::cerr << "error occurred" << std::endl;
-      return 1;
-   }
-#endif
-   return 0;
-}
diff --git a/packages/PEGTL/src/example/pegtl/proto3.cpp b/packages/PEGTL/src/example/pegtl/proto3.cpp
index 39d126fa95e33cae1ced9676f317da0cab175a21..cb7303787ccc7be48c9e5cc4887b9f511264d974 100644
--- a/packages/PEGTL/src/example/pegtl/proto3.cpp
+++ b/packages/PEGTL/src/example/pegtl/proto3.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
@@ -14,132 +15,7 @@ int main()
 
 #include <tao/pegtl.hpp>
 #include <tao/pegtl/contrib/analyze.hpp>
-
-namespace TAO_PEGTL_NAMESPACE::proto3
-{
-   // clang-format off
-   struct comment_sl : seq< two< '/' >, until< eolf > > {};
-   struct comment_ml : seq< one< '/' >, one< '*' >, until< seq< one< '*' >, one< '/' > > > > {};
-   struct sp : sor< space, comment_sl, comment_ml > {};
-   struct sps : star< sp > {};
-
-   struct comma : one< ',' > {};
-   struct dot : one< '.' > {};
-   struct equ : one< '=' > {};
-   struct semi : one< ';' > {};
-
-   struct option;
-   struct message;
-   struct extend;
-
-   struct odigit : range< '0', '7' > {};
-
-   struct ident_first : ranges< 'a', 'z', 'A', 'Z' > {};  // NOTE: Yes, no '_'.
-   struct ident_other : ranges< 'a', 'z', 'A', 'Z', '0', '9', '_' > {};
-   struct ident : seq< ident_first, star< ident_other > > {};
-   struct full_ident : list_must< ident, dot > {};
-
-   struct sign : one< '+', '-' > {};
-   struct oct_lit : seq< one< '0' >, plus< odigit > > {};
-   struct hex_lit : seq< one< '0' >, one< 'x', 'X' >, plus< xdigit > > {};
-   struct dec_lit : sor< one< '0' >, seq< opt< sign >, range< '1', '9' >, star< digit > > >  {};
-   struct int_lit : sor< dec_lit, hex_lit, oct_lit > {};
-
-   struct hex_escape : if_must< one< 'x', 'X' >, xdigit, xdigit > {};
-   struct oct_escape : if_must< odigit, odigit, odigit > {};
-   struct char_escape : one< 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"' > {};
-   struct escape : if_must< one< '\\' >, hex_escape, oct_escape, char_escape > {};
-   struct char_value : sor< escape, not_one< '\n', '\0' > > {};  // NOTE: No need to exclude '\' from not_one<>, see escape rule.
-   template< char Q >
-   struct str_impl : if_must< one< Q >, until< one< Q >, char_value > > {};
-   struct str_lit : sor< str_impl< '\'' >, str_impl< '"' > > {};
-
-   struct bool_lit : sor< keyword< 't', 'r', 'u', 'e' >, keyword< 'f', 'a', 'l', 's', 'e' > > {};
-
-   struct exp : seq < one <'E', 'e'>, opt< sign >, plus< digit > > {};
-   struct float_lit_1 : seq< plus< digit >, dot, star< digit >, opt< exp > > {};
-   struct float_lit_2 : seq< dot, plus< digit >, opt< exp > > {};
-   struct float_lit_3 : seq< plus< digit >, exp > {};
-   struct float_lit : sor < seq< opt<sign>, sor< float_lit_1, float_lit_2, float_lit_3, keyword< 'i', 'n', 'f' > > >, keyword< 'n', 'a', 'n' > > {};
-
-   struct constant : sor< bool_lit, full_ident, float_lit, int_lit, str_lit > {};
-
-   struct option_name : seq< sor< ident, if_must< one< '(' >, full_ident, one< ')' > > >, star_must< dot, ident > > {};
-   struct option : if_must< keyword< 'o', 'p', 't', 'i', 'o', 'n' >, sps, option_name, sps, equ, sps, constant, sps, semi > {};
-
-   struct bool_type : keyword< 'b', 'o', 'o', 'l' > {};
-   struct bytes_type : keyword< 'b', 'y', 't', 'e', 's' > {};
-   struct double_type : keyword< 'd', 'o', 'u', 'b', 'l', 'e' > {};
-   struct float_type : keyword< 'f', 'l', 'o', 'a', 't' > {};
-   struct string_type : keyword< 's', 't', 'r', 'i', 'n', 'g' > {};
-
-   struct int32_type : keyword< 'i', 'n', 't', '3', '2' > {};
-   struct int64_type : keyword< 'i', 'n', 't', '6', '4' > {};
-   struct sint32_type : keyword< 's', 'i', 'n', 't', '3', '2' > {};
-   struct sint64_type : keyword< 's', 'i', 'n', 't', '6', '4' > {};
-   struct uint32_type : keyword< 'u', 'i', 'n', 't', '3', '2' > {};
-   struct uint64_type : keyword< 'u', 'i', 'n', 't', '6', '4' > {};
-   struct fixed32_type : keyword< 'f', 'i', 'x', 'e', 'd', '3', '2' > {};
-   struct fixed64_type : keyword< 'f', 'i', 'x', 'e', 'd', '6', '4' > {};
-   struct sfixed32_type : keyword< 's', 'f', 'i', 'x', 'e', 'd', '3', '2' > {};
-   struct sfixed64_type : keyword< 's', 'f', 'i', 'x', 'e', 'd', '6', '4' > {};
-
-   struct builtin_type : sor< bool_type, bytes_type, double_type, float_type, string_type, int32_type, int64_type, sint32_type, sint64_type, uint32_type, uint64_type, fixed32_type, fixed64_type, sfixed32_type, sfixed64_type > {};
-
-   struct defined_type : seq< opt< dot >, full_ident > {};  // NOTE: This replaces both message_type and enum_type -- they have the same syntax.
-
-   struct type : sor< builtin_type, defined_type > {};
-
-   struct field_option : if_must< option_name, sps, equ, sps, constant > {};
-   struct field_options : if_must< one< '[' >, sps, list< field_option, comma, sp >, sps, one< ']' > > {};
-   struct field_name : ident {};
-   struct field_number : int_lit {};
-   struct field : seq< opt< sor < keyword< 'o', 'p', 't', 'i', 'o', 'n', 'a', 'l' >, keyword< 'r', 'e', 'p', 'e', 'a', 't', 'e', 'd' > >, sps >, type, sps, field_name, sps, equ, sps, field_number, sps, opt< field_options, sps >, semi > {};
-
-   struct oneof_name : ident {};
-   struct oneof_field : if_must< type, sps, field_name, sps, equ, sps, field_number, sps, opt< field_options, sps >, semi > {};
-   struct oneof_body : sor< oneof_field, semi > {};
-   struct oneof : if_must< keyword< 'o', 'n', 'e', 'o', 'f' >, sps, oneof_name, sps, one< '{' >, sps, until< one< '}' >, oneof_body, sps >, sps > {};
-
-   struct key_type : seq< sor< bool_type, string_type, int32_type, int64_type, sint32_type, sint64_type, uint32_type, uint64_type, fixed32_type, fixed64_type, sfixed32_type, sfixed64_type >, not_at< ident_other > > {};
-   struct map_name : ident {};
-   struct map_field : if_must< keyword< 'm', 'a', 'p' >, sps, one< '<' >, sps, key_type, sps, comma, sps, type, sps, one< '>' >, sps, map_name, sps, equ, sps, field_number, sps, opt< field_options, sps >, semi > {};
-
-   struct range : if_must< int_lit, sps, keyword< 't', 'o' >, sps, sor< int_lit, keyword< 'm', 'a', 'x' > > > {};
-   struct ranges : list_must< range, comma, sp > {};
-   struct field_names : list_must< field_name, comma, sp > {};
-   struct reserved : if_must< keyword< 'r', 'e', 's', 'e', 'r', 'v', 'e', 'd' >, sps, sor< ranges, field_names >, sps, semi > {};
-
-   struct enum_name : ident {};
-   struct enum_value_option : seq< option_name, sps, equ, sps, constant > {};
-   struct enum_field : seq< ident, sps, equ, sps, int_lit, sps, opt_must< one< '[' >, sps, list_must< enum_value_option, comma, sp >, sps, one< ']' >, sps >, semi > {};
-   struct enum_body : if_must< one< '{' >, sps, star< sor< option, enum_field, semi >, sps >, one< '}' > > {};
-   struct enum_def : if_must< keyword< 'e', 'n', 'u', 'm' >, sps, enum_name, sps, enum_body > {};
-
-   struct message_thing : sor< field, enum_def, message, option, oneof, map_field, reserved, extend, semi > {};
-   struct message_body : seq< one<'{'>, sps, star< message_thing, sps >, one<'}'> > {};
-   struct message : if_must< keyword< 'm', 'e', 's', 's', 'a', 'g', 'e' >, sps, defined_type, sps, message_body > {};
-   struct extend : if_must< keyword< 'e', 'x', 't', 'e', 'n', 'd' >, sps, defined_type, sps, message_body > {};
-
-   struct package : if_must< keyword< 'p', 'a', 'c', 'k', 'a', 'g', 'e' >, sps, full_ident, sps, semi > {};
-
-   struct import_option : opt< sor< keyword< 'w', 'e', 'a', 'k' >, keyword< 'p', 'u', 'b', 'l', 'i', 'c' > > > {};
-   struct import : if_must< keyword< 'i', 'm', 'p', 'o', 'r', 't' >, sps, import_option, sps, str_lit, sps, semi > {};
-
-   struct rpc_name : ident {};
-   struct rpc_type : if_must< one< '(' >, sps, opt< keyword< 's', 't', 'r', 'e', 'a', 'm' >, sps >, defined_type, sps, one< ')' > > {};
-   struct rpc_options : if_must< one< '{' >, sps, star< sor< option, semi >, sps >, one< '}' > > {};
-   struct rpc : if_must< keyword< 'r', 'p', 'c' >, sps, rpc_name, sps, rpc_type, sps, keyword< 'r', 'e', 't', 'u', 'r', 'n', 's' >, sps, rpc_type, sps, sor< semi, rpc_options > > {};
-   struct service_name : ident {};
-   struct service : if_must< keyword< 's', 'e', 'r', 'v', 'i', 'c', 'e' >, sps, service_name, sps, one< '{' >, sps, star< sor< option, rpc, semi >, sps >, one< '}' > > {};
-
-   struct body : sor< import, package, option, message, enum_def, service, extend, semi > {};
-
-   struct head : if_must< keyword< 's', 'y', 'n', 't', 'a', 'x' >, sps, equ, sps, string< '"', 'p', 'r', 'o', 't', 'o', '3', '"' >, sps, semi > {};
-   struct proto : must< sps, head, sps, star< body, sps >, eof > {};
-   // clang-format on
-
-}  // namespace TAO_PEGTL_NAMESPACE::proto3
+#include <tao/pegtl/contrib/proto3.hpp>
 
 int main( int argc, char** argv )  // NOLINT(bugprone-exception-escape)
 {
diff --git a/packages/PEGTL/src/example/pegtl/recover.cpp b/packages/PEGTL/src/example/pegtl/recover.cpp
index 15baabd5e543bde94ad412b5ecdd1f201e1649e4..9f6e2076799d5b8e34c58eba022f10340c3f72a9 100644
--- a/packages/PEGTL/src/example/pegtl/recover.cpp
+++ b/packages/PEGTL/src/example/pegtl/recover.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 // This is a small experiment with a grammar that can recover from errors.
 //
diff --git a/packages/PEGTL/src/example/pegtl/s_expression.cpp b/packages/PEGTL/src/example/pegtl/s_expression.cpp
index 28dcace3a280998415d7b1aa2fe713ed244dac61..d0ab218c5928e8c326b3bdf81a22c4108aa84690 100644
--- a/packages/PEGTL/src/example/pegtl/s_expression.cpp
+++ b/packages/PEGTL/src/example/pegtl/s_expression.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/sum.cpp b/packages/PEGTL/src/example/pegtl/sum.cpp
index 6a17863cacafb27d70f62bf7e804c77037002326..9867e3ab84475b9a2c06fbd52ea791d46e36beff 100644
--- a/packages/PEGTL/src/example/pegtl/sum.cpp
+++ b/packages/PEGTL/src/example/pegtl/sum.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cstdlib>
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/symbol_table.cpp b/packages/PEGTL/src/example/pegtl/symbol_table.cpp
index 673174c9b88e8033b8c7db4d118e6161ca932d7a..5dba51e3b91b1ebab816d25d759854b3e1286a90 100644
--- a/packages/PEGTL/src/example/pegtl/symbol_table.cpp
+++ b/packages/PEGTL/src/example/pegtl/symbol_table.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/token_input.cpp b/packages/PEGTL/src/example/pegtl/token_input.cpp
index 7d2795a3fb9bf453bbb3aa222ab2185d2a2b9517..df35d4aafdbc061753dd69ca6d75a79b766a6ee3 100644
--- a/packages/PEGTL/src/example/pegtl/token_input.cpp
+++ b/packages/PEGTL/src/example/pegtl/token_input.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 #include <string>
diff --git a/packages/PEGTL/src/example/pegtl/unescape.cpp b/packages/PEGTL/src/example/pegtl/unescape.cpp
index f445435b112989cf8215bfa7e0dbae0b2983e012..10f2a1f4071517108dad344cef0a938ec384264b 100644
--- a/packages/PEGTL/src/example/pegtl/unescape.cpp
+++ b/packages/PEGTL/src/example/pegtl/unescape.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 
diff --git a/packages/PEGTL/src/example/pegtl/uri.cpp b/packages/PEGTL/src/example/pegtl/uri.cpp
index 413caf99e6349678ec116735972cd4b8f3d8453e..a964959c41d1b7e9840eceb1b2f9c2ebc737f8da 100644
--- a/packages/PEGTL/src/example/pegtl/uri.cpp
+++ b/packages/PEGTL/src/example/pegtl/uri.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/uri_print_debug.cpp b/packages/PEGTL/src/example/pegtl/uri_print_debug.cpp
index 10e42f2433cbfc6c9afbca6ed5ebd55330b55401..f535e3a11b5c070e95ad64a3b9511b65a5b989c6 100644
--- a/packages/PEGTL/src/example/pegtl/uri_print_debug.cpp
+++ b/packages/PEGTL/src/example/pegtl/uri_print_debug.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/uri_print_names.cpp b/packages/PEGTL/src/example/pegtl/uri_print_names.cpp
index 708c3e71113b6844da410a7d7b46926c61d58b96..95361921756a6d13d4c4a0b73ba6f533e6bf1097 100644
--- a/packages/PEGTL/src/example/pegtl/uri_print_names.cpp
+++ b/packages/PEGTL/src/example/pegtl/uri_print_names.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/example/pegtl/uri_trace.cpp b/packages/PEGTL/src/example/pegtl/uri_trace.cpp
index e1392f85a54f967ffa563b89e82f2530c97d9214..d041d3343a468a1c651fbd6d09edc3e354b6f20a 100644
--- a/packages/PEGTL/src/example/pegtl/uri_trace.cpp
+++ b/packages/PEGTL/src/example/pegtl/uri_trace.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/CMakeLists.txt b/packages/PEGTL/src/test/pegtl/CMakeLists.txt
index 6bfacb4a72caec4e1dd08bdaade1d934e3bac9e0..eae7a9546c6fa5b0cb113cb999788a6cbbd828cc 100644
--- a/packages/PEGTL/src/test/pegtl/CMakeLists.txt
+++ b/packages/PEGTL/src/test/pegtl/CMakeLists.txt
@@ -34,6 +34,7 @@ set(test_sources
   contrib_if_then.cpp
   contrib_instantiate.cpp
   contrib_integer.cpp
+  contrib_iri.cpp
   contrib_json.cpp
   contrib_parse_tree.cpp
   contrib_parse_tree_to_dot.cpp
@@ -45,6 +46,7 @@ set(test_sources
   contrib_remove_last_states.cpp
   contrib_rep_one_min_max.cpp
   contrib_rep_string.cpp
+  contrib_separated_seq.cpp
   contrib_state_control.cpp
   contrib_to_string.cpp
   contrib_trace1.cpp
diff --git a/packages/PEGTL/src/test/pegtl/action_enable.cpp b/packages/PEGTL/src/test/pegtl/action_enable.cpp
index eaead46a3d235a9c6ac867392e5d6b72435680b0..3f7a5cb4228da53c3ec04d1b4938925631b38c22 100644
--- a/packages/PEGTL/src/test/pegtl/action_enable.cpp
+++ b/packages/PEGTL/src/test/pegtl/action_enable.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/action_match.cpp b/packages/PEGTL/src/test/pegtl/action_match.cpp
index f8f448c44fb1cce6fa4939550b4cd888e838d291..af6f5709d0902103073a114981bc0cb065fe641b 100644
--- a/packages/PEGTL/src/test/pegtl/action_match.cpp
+++ b/packages/PEGTL/src/test/pegtl/action_match.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/actions_one.cpp b/packages/PEGTL/src/test/pegtl/actions_one.cpp
index 9e8998c3050b023150ca1351e7db223cb5307319..4b0f4b6fda87e735883ec3e2f5a27c586f7dbbf6 100644
--- a/packages/PEGTL/src/test/pegtl/actions_one.cpp
+++ b/packages/PEGTL/src/test/pegtl/actions_one.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/actions_three.cpp b/packages/PEGTL/src/test/pegtl/actions_three.cpp
index 8f9fa9b351026eeddd2137253ca72242a013dff7..987f764578a77f2c8d9afdcd1983b968d927b318 100644
--- a/packages/PEGTL/src/test/pegtl/actions_three.cpp
+++ b/packages/PEGTL/src/test/pegtl/actions_three.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/actions_two.cpp b/packages/PEGTL/src/test/pegtl/actions_two.cpp
index 40f736757f87ad9ca519800a5a2b4c91a6e4e83e..77687875c6755d5f8558a3fc4f6445ac69ecf2b7 100644
--- a/packages/PEGTL/src/test/pegtl/actions_two.cpp
+++ b/packages/PEGTL/src/test/pegtl/actions_two.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/argv_input.cpp b/packages/PEGTL/src/test/pegtl/argv_input.cpp
index 933e29c06d71351a1401bc8c3388222749b2e5db..0f3c283d20b99ae08f1e94a265a4f7c0ffd440c4 100644
--- a/packages/PEGTL/src/test/pegtl/argv_input.cpp
+++ b/packages/PEGTL/src/test/pegtl/argv_input.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cstring>
 
diff --git a/packages/PEGTL/src/test/pegtl/ascii_classes.cpp b/packages/PEGTL/src/test/pegtl/ascii_classes.cpp
index 2989634c00709cdcba9c79f14357dd046436e1ab..f9e255c33f11bbe7e47e6f879a8234271faf061d 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_classes.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_classes.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_eol.cpp b/packages/PEGTL/src/test/pegtl/ascii_eol.cpp
index 851eba98f5009ac9ec68bd4120ea85558729f98d..0169eda66cd94f2794254e7990b6dd7e4c93ced5 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_eol.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_eol.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_eolf.cpp b/packages/PEGTL/src/test/pegtl/ascii_eolf.cpp
index c0016df14de850706cf4cbe9bc9c2e03fe0c4fc8..c6033080c516ba6b6eeebd9755511834f9067c6d 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_eolf.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_eolf.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_forty_two.cpp b/packages/PEGTL/src/test/pegtl/ascii_forty_two.cpp
index efcafff31a8e8f646a76f4651ddd5abad1b4822b..bf3edcbdc123c1b33e39f655f05cd15ec51f8d22 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_forty_two.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_forty_two.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_identifier.cpp b/packages/PEGTL/src/test/pegtl/ascii_identifier.cpp
index 2e97e5851660647a367a4e2cfdcea2661d791ecd..8c1a14b852f6d3ea69a194ff625f7eb6b80be8e6 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_identifier.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_identifier.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_istring.cpp b/packages/PEGTL/src/test/pegtl/ascii_istring.cpp
index a5a74e8b84e752222b5f373547ce64eaf63a9926..3f1b1ff09aa1aa19dffc33e55dc256d80baaeaa7 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_istring.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_istring.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_keyword.cpp b/packages/PEGTL/src/test/pegtl/ascii_keyword.cpp
index d4b6dd128622d17847510e9064d025c3865a53ad..5d384c5f120f3eb644533944031d14875058bef6 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_keyword.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_keyword.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_shebang.cpp b/packages/PEGTL/src/test/pegtl/ascii_shebang.cpp
index 3f030ee5e8f98bbe904a5e27789eb62df12b7153..71fc843daa3365e3c2eda3f4c7c7b7e273a87f00 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_shebang.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_shebang.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_string.cpp b/packages/PEGTL/src/test/pegtl/ascii_string.cpp
index 9712d3132c547be18520f65ee2e20000fef14527..e128dca6a002a1f92d10227996d37292da275207 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_string.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_string.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_three.cpp b/packages/PEGTL/src/test/pegtl/ascii_three.cpp
index 2b0b1c03e6e5bf6f6b9d220cceb3fde2c4ed5eb1..68a026debfec176c96238da99298b896669b1610 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_three.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_three.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/ascii_two.cpp b/packages/PEGTL/src/test/pegtl/ascii_two.cpp
index 5326620bdbfa59fca923239148bf68d82ea5d3b0..8c0ff00973cd87f0bf28eb3108aa3ac10e4d5a80 100644
--- a/packages/PEGTL/src/test/pegtl/ascii_two.cpp
+++ b/packages/PEGTL/src/test/pegtl/ascii_two.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/buffer_input.cpp b/packages/PEGTL/src/test/pegtl/buffer_input.cpp
index e5ab52ba1d502b34ec20923ea59787fe187e7fd3..eff7614e5f347d52cc942487ecf8856643f94e70 100644
--- a/packages/PEGTL/src/test/pegtl/buffer_input.cpp
+++ b/packages/PEGTL/src/test/pegtl/buffer_input.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <string>
 
diff --git a/packages/PEGTL/src/test/pegtl/change_action_and_state.cpp b/packages/PEGTL/src/test/pegtl/change_action_and_state.cpp
index a631f9dc02016cc9267d794f0a9032d08a7f0751..9cf3cce155c0ce5ab17966ad50bb2637f989fcf5 100644
--- a/packages/PEGTL/src/test/pegtl/change_action_and_state.cpp
+++ b/packages/PEGTL/src/test/pegtl/change_action_and_state.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/change_action_and_states.cpp b/packages/PEGTL/src/test/pegtl/change_action_and_states.cpp
index 4bd4f5ffec701333d7bd50767708fd342b77703d..9ea7faa781b2f2aad0ce01633e4e15908dc3d7e0 100644
--- a/packages/PEGTL/src/test/pegtl/change_action_and_states.cpp
+++ b/packages/PEGTL/src/test/pegtl/change_action_and_states.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/change_state.cpp b/packages/PEGTL/src/test/pegtl/change_state.cpp
index bc7840256a7f884b87841cbaad5df6d0d7390743..6061b9178cc9e0ec0a3e5ebcfa2182a5d8a051d8 100644
--- a/packages/PEGTL/src/test/pegtl/change_state.cpp
+++ b/packages/PEGTL/src/test/pegtl/change_state.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/change_states.cpp b/packages/PEGTL/src/test/pegtl/change_states.cpp
index 0dc0f533f90a15ffc727d496c6f348b4dcfcd405..64a4b27620d241d9e23d02860b8dd972e8a66773 100644
--- a/packages/PEGTL/src/test/pegtl/change_states.cpp
+++ b/packages/PEGTL/src/test/pegtl/change_states.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/check_bytes.cpp b/packages/PEGTL/src/test/pegtl/check_bytes.cpp
index b773abed9b0d168bc4502b779617a38a5a3051b0..d5dc2ad00eb8e9c666f352df8a9cf674f59aa021 100644
--- a/packages/PEGTL/src/test/pegtl/check_bytes.cpp
+++ b/packages/PEGTL/src/test/pegtl/check_bytes.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <tao/pegtl/contrib/check_bytes.hpp>
 
diff --git a/packages/PEGTL/src/test/pegtl/contains.cpp b/packages/PEGTL/src/test/pegtl/contains.cpp
index 68b24b73cd3a6e9cea1b3427028e25bd899aa9ad..ee1d9a6fb2a9b57d86c87af48d6fb966354c5d62 100644
--- a/packages/PEGTL/src/test/pegtl/contains.cpp
+++ b/packages/PEGTL/src/test/pegtl/contains.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <type_traits>
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_alphabet.cpp b/packages/PEGTL/src/test/pegtl/contrib_alphabet.cpp
index 1aac0e762a06e36dd7c209ab757578b57606c6b6..73fdecf224063ac5e1e0f3900b35c3b459238f72 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_alphabet.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_alphabet.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_analyze.cpp b/packages/PEGTL/src/test/pegtl/contrib_analyze.cpp
index fec49897a614ee73ee3f8a55b3271869ffdb376c..cce72b544c35918843e7707def84b4f28019709c 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_analyze.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_analyze.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/contrib_control_action.cpp b/packages/PEGTL/src/test/pegtl/contrib_control_action.cpp
index 8249dc6008be2fd76fb9058ff7f0d6d7b7dee987..512d1ba541fcccb93543b5c3b0cd3747497f9386 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_control_action.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_control_action.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <string>
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_coverage.cpp b/packages/PEGTL/src/test/pegtl/contrib_coverage.cpp
index c9661624facc95358e6635f51d73582b1ec2380c..b70954e24ec22f59ecf26775aa8d703e6461cd75 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_coverage.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_coverage.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_function.cpp b/packages/PEGTL/src/test/pegtl/contrib_function.cpp
index 3191981de18fbaa6830d203d75edb6969fb7b9ae..3455fe18cc4046c29bb47bfda689ff5dc2ce0e42 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_function.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_function.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_http.cpp b/packages/PEGTL/src/test/pegtl/contrib_http.cpp
index 3fb449d0ae3a17adbfe2694d302159a8775ab097..3bdb52d59c4fa9d9756b5ecf5189fb45da75d8b4 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_http.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_http.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/contrib_if_then.cpp b/packages/PEGTL/src/test/pegtl/contrib_if_then.cpp
index ceb60d6029c9ed7a41118699c6cb344b86c3ec45..b98a3ba598dababe65395bf4e85653b369373cc5 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_if_then.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_if_then.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_rule.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp b/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp
index fc85dec752d01633ddf3bf9f45642d84b4c896ea..86566822b095f1465fd9ca848e365c7c4bba89da 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_instantiate.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_integer.cpp b/packages/PEGTL/src/test/pegtl/contrib_integer.cpp
index d08182a9c7e400aee001fde49d9c6ae5dc9fb9a9..f3441c0ea0e199e58083c8426d8589ef555def94 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_integer.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_integer.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
@@ -173,11 +174,11 @@ namespace TAO_PEGTL_NAMESPACE
       test_unsigned< unsigned char >( "000256" );
 
       test_signed< signed long long >( "0", 0 );
-      test_signed< signed long long >( (std::numeric_limits< signed long long >::max)() );
-      test_signed< signed long long >( (std::numeric_limits< signed long long >::min)() );
+      test_signed< signed long long >( ( std::numeric_limits< signed long long >::max )() );
+      test_signed< signed long long >( ( std::numeric_limits< signed long long >::min )() );
 
       test_unsigned< unsigned long long >( "0", 0 );
-      test_unsigned< unsigned long long >( (std::numeric_limits< unsigned long long >::max)() );
+      test_unsigned< unsigned long long >( ( std::numeric_limits< unsigned long long >::max )() );
 
       verify_rule< max_seq_rule< 0 > >( __LINE__, __FILE__, "a0b", result_type::success );
       verify_rule< max_seq_rule< 0 > >( __LINE__, __FILE__, "ab", result_type::local_failure );
diff --git a/packages/PEGTL/src/test/pegtl/contrib_iri.cpp b/packages/PEGTL/src/test/pegtl/contrib_iri.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96890d71c3e66aa9f68db8f7c3f2486c99bcc890
--- /dev/null
+++ b/packages/PEGTL/src/test/pegtl/contrib_iri.cpp
@@ -0,0 +1,58 @@
+// Copyright (c) 2021 Kelvin Hammond
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
+
+#if !defined( __cpp_exceptions )
+#include <iostream>
+int main()
+{
+   std::cout << "Exception support disabled, skipping test..." << std::endl;
+}
+#else
+
+#include "test.hpp"
+#include "verify_meta.hpp"
+#include "verify_rule.hpp"
+
+#include <tao/pegtl/contrib/iri.hpp>
+
+namespace TAO_PEGTL_NAMESPACE
+{
+   using GRAMMAR = must< iri::IRI, eof >;
+
+   void unit_test()
+   {
+      verify_analyze< GRAMMAR >( __LINE__, __FILE__, true, false );
+
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "https://en.wikipedia.org/wiki/Internationalized_Resource_Identifier", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "https://en.wiktionary.org/wiki/%E1%BF%AC%CF%8C%CE%B4%CE%BF%CF%82", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "https://en.wiktionary.org/wiki/Ῥόδος", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "https://www.myfictionαlbank.com", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "ftp://ftp.is.co.za/rfc/rfc1808.txt", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "file:///C:/Users/Benutzer/Desktop/Uniform%20Resource%20Identifier.html", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "file:///etc/fstab", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "geo:48.33,14.122;u=22.5", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "ldap://[2001:db8::7]/c=GB?objectClass?one", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "gopher://gopher.floodgap.com", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "mailto:John.Doe@example.com", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "sip:911@pbx.mycompany.com", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "news:comp.infosystems.www.servers.unix", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "data:text/plain;charset=iso-8859-7,%be%fa%be", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "tel:+1-816-555-1212", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "telnet://192.0.2.16:80/", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "git://github.com/rails/rails.git", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "crid://broadcaster.com/movies/BestActionMovieEver", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "http://nobody:password@example.org:8080/cgi-bin/script.php?action=submit&pageid=86392001#section_2", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "quake://480fps.com:26000/", result_type::success );
+      verify_rule< GRAMMAR >( __LINE__, __FILE__, "ftp://300.300.300.300/foo", result_type::success );  // 300.300.300.300 is a valid hostname!
+
+      TAO_PEGTL_TEST_THROWS( parse< GRAMMAR >( memory_input( "", "" ) ) );
+   }
+
+}  // namespace TAO_PEGTL_NAMESPACE
+
+#include "main.hpp"
+
+#endif
diff --git a/packages/PEGTL/src/test/pegtl/contrib_json.cpp b/packages/PEGTL/src/test/pegtl/contrib_json.cpp
index b7a2945b4253f004afc6f404bad043c01d7c3423..62cf3b778a657c90cfc029cc83b85d97773f61ad 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_json.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_json.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/contrib_parse_tree.cpp b/packages/PEGTL/src/test/pegtl/contrib_parse_tree.cpp
index 2bcd188518fb5a1e4349bf547194d545014aab37..04a8a0a797fae9a1b0967c25af2aa0fb9e8294ac 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_parse_tree.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_parse_tree.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_parse_tree_to_dot.cpp b/packages/PEGTL/src/test/pegtl/contrib_parse_tree_to_dot.cpp
index 98bc76f467799e524adf41a45760974a395fec29..517092ab26237f66b7d563e84c84aebcba0af72b 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_parse_tree_to_dot.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_parse_tree_to_dot.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_partial_trace.cpp b/packages/PEGTL/src/test/pegtl/contrib_partial_trace.cpp
index d4e829852fedabf84738d838efffd19813aae49e..1c8f1a97786d7e4632dc596a573eca742b202641 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_partial_trace.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_partial_trace.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_predicates.cpp b/packages/PEGTL/src/test/pegtl/contrib_predicates.cpp
index 0af55ea6249487986aa669d490afc13e3b443e1c..00403c330686cb07f8dcd52d86192314f0db2588 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_predicates.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_predicates.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/contrib_print.cpp b/packages/PEGTL/src/test/pegtl/contrib_print.cpp
index 8b19e150c32fd3b6c0ec51b7c7f5daa06a76d521..2c2bde7e73dca9565e4c44b1ffcbda2216208e57 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_print.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_print.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_raw_string.cpp b/packages/PEGTL/src/test/pegtl/contrib_raw_string.cpp
index f02f9bc666857746814364a11f4a42db6be4af76..ef6abda59265fd317fc3bded653ceb7559c6767d 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_raw_string.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_raw_string.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/contrib_remove_first_state.cpp b/packages/PEGTL/src/test/pegtl/contrib_remove_first_state.cpp
index bd2aedb94c297e6b1e848cd6d4a5eed29c25a7a9..c31ef5d40739576a07e59fcaad5db9a7a7e2bc07 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_remove_first_state.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_remove_first_state.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_remove_last_states.cpp b/packages/PEGTL/src/test/pegtl/contrib_remove_last_states.cpp
index 76dae2f5eabbde1bb755d861ea0bb0a03bdc0808..240c2abb0001d82c8e7dbc6596740900620c6f33 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_remove_last_states.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_remove_last_states.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_rep_one_min_max.cpp b/packages/PEGTL/src/test/pegtl/contrib_rep_one_min_max.cpp
index 2ee88f010e30eaecd2dae96365d1b7559451e044..69dd862afa5312d7ef385552e2dbf7a7d8ba1af3 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_rep_one_min_max.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_rep_one_min_max.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/contrib_rep_string.cpp b/packages/PEGTL/src/test/pegtl/contrib_rep_string.cpp
index 37b1b891e713ac4241adc14e80b12e1d1917b31b..91eda129eb8fc62b9a70c2caca00947cb4b02dc6 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_rep_string.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_rep_string.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_separated_seq.cpp b/packages/PEGTL/src/test/pegtl/contrib_separated_seq.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1a47a5fbeddf3a2c826a5ab95c96f1086d56f1a3
--- /dev/null
+++ b/packages/PEGTL/src/test/pegtl/contrib_separated_seq.cpp
@@ -0,0 +1,25 @@
+// Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
+
+#include <tao/pegtl/contrib/separated_seq.hpp>
+
+#include <type_traits>
+
+// clang-format off
+struct A {};
+struct B {};
+struct C {};
+struct D {};
+
+struct S {};
+// clang-format on
+
+using namespace TAO_PEGTL_NAMESPACE;
+static_assert( std::is_base_of_v< internal::seq<>, separated_seq< S > > );
+static_assert( std::is_base_of_v< internal::seq< A >, separated_seq< S, A > > );
+static_assert( std::is_base_of_v< internal::seq< A, S, B >, separated_seq< S, A, B > > );
+static_assert( std::is_base_of_v< internal::seq< A, S, B, S, C, S, D >, separated_seq< S, A, B, C, D > > );
+
+int main()
+{}
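Aside from the license-header change, the new test above pins down the expansion of the contrib rule separated_seq< S, R... >: it derives from seq< R0, S, R1, S, R2, ... >, i.e. the separator S is interleaved between the given rules. The following sketch (not part of this diff; the key/value/assignment rule names and the input string are hypothetical) shows how that expansion could be used in a grammar without repeating the separator by hand:

// Minimal usage sketch for separated_seq, assuming the contrib header added in this diff.
#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/separated_seq.hpp>

namespace pegtl = tao::pegtl;

// Hypothetical grammar: "key = value" with single spaces as the separator.
struct key : pegtl::plus< pegtl::alpha > {};
struct value : pegtl::plus< pegtl::digit > {};
// Expands to seq< key, one< ' ' >, one< '=' >, one< ' ' >, value >.
struct assignment : pegtl::separated_seq< pegtl::one< ' ' >, key, pegtl::one< '=' >, value > {};

int main()
{
   pegtl::memory_input in( "answer = 42", "" );
   return pegtl::parse< pegtl::seq< assignment, pegtl::eof > >( in ) ? 0 : 1;
}
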
diff --git a/packages/PEGTL/src/test/pegtl/contrib_state_control.cpp b/packages/PEGTL/src/test/pegtl/contrib_state_control.cpp
index 2a68ace18232645dfb6ed36f219c2b7ea8cc3f2d..7fc3b2b2929a904fe4c30efb1bd3fbc8e107a986 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_state_control.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_state_control.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/contrib_to_string.cpp b/packages/PEGTL/src/test/pegtl/contrib_to_string.cpp
index a956be930d82f62e4b3e9ff01dd03296f2fb6a0f..bd03ed78940f1abbf14f56d7abffd71c6c038717 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_to_string.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_to_string.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_trace1.cpp b/packages/PEGTL/src/test/pegtl/contrib_trace1.cpp
index 6a0212b023ad0cf91d4cd18427c3c485e727e401..31cc5c866e3d732f0c26a53276f534a1aca443f6 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_trace1.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_trace1.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_trace2.cpp b/packages/PEGTL/src/test/pegtl/contrib_trace2.cpp
index a8a35a6b0bc027d8c24bf9d1a324e208e5b7d913..90112f7e896c7a3af151d23d884cf9990ccc9621 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_trace2.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_trace2.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_unescape.cpp b/packages/PEGTL/src/test/pegtl/contrib_unescape.cpp
index 7b010b6b1b815e8bbf04c1cd8c90ce6325a2d45e..9efda4b9001406bcfd82d09cc5a6aedb25b01f89 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_unescape.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_unescape.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/contrib_uri.cpp b/packages/PEGTL/src/test/pegtl/contrib_uri.cpp
index 0d855035fe140dba5cb11528600f806f3c6d015d..9162abe33b8b81c83a92a0d468ff3f686fcbd1ff 100644
--- a/packages/PEGTL/src/test/pegtl/contrib_uri.cpp
+++ b/packages/PEGTL/src/test/pegtl/contrib_uri.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/control_unwind.cpp b/packages/PEGTL/src/test/pegtl/control_unwind.cpp
index b07dcdcccd91882f5db6d5514bc59ad6636e4ffc..468678e23268f6bd7d830d6f9a83a83cd4487acd 100644
--- a/packages/PEGTL/src/test/pegtl/control_unwind.cpp
+++ b/packages/PEGTL/src/test/pegtl/control_unwind.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/data_cstring.cpp b/packages/PEGTL/src/test/pegtl/data_cstring.cpp
index fc1e5c270c48c1b833781650b3def2e6ed342fe7..f8e86171e85e0e00ac5ec5a0b0c706eb6097e222 100644
--- a/packages/PEGTL/src/test/pegtl/data_cstring.cpp
+++ b/packages/PEGTL/src/test/pegtl/data_cstring.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/demangle.cpp b/packages/PEGTL/src/test/pegtl/demangle.cpp
index c1ae7396e4c798e478f09547163bf4c54fd8c766..b7acf79c27f62ec3e8f1058213f34e29e284b7ac 100644
--- a/packages/PEGTL/src/test/pegtl/demangle.cpp
+++ b/packages/PEGTL/src/test/pegtl/demangle.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/discard_input.cpp b/packages/PEGTL/src/test/pegtl/discard_input.cpp
index 996d75c8a86512cf30ed192920c3212031f696a4..3dac97076d12635e3cb8ea9f209b262a3aedd043 100644
--- a/packages/PEGTL/src/test/pegtl/discard_input.cpp
+++ b/packages/PEGTL/src/test/pegtl/discard_input.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <string>
 
diff --git a/packages/PEGTL/src/test/pegtl/enable_control.cpp b/packages/PEGTL/src/test/pegtl/enable_control.cpp
index 29c76c1379a51cb120d4f3bcaee0b8dcc9a58a62..49429e39142d7a05b5074af83a41b5300e118dda 100644
--- a/packages/PEGTL/src/test/pegtl/enable_control.cpp
+++ b/packages/PEGTL/src/test/pegtl/enable_control.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <tao/pegtl.hpp>
 
diff --git a/packages/PEGTL/src/test/pegtl/error_message.cpp b/packages/PEGTL/src/test/pegtl/error_message.cpp
index d701fc24afaafc966c037ed1617b2e9559e810e7..220cfbc5292fb517020a67b5f256e5ad643ea3fd 100644
--- a/packages/PEGTL/src/test/pegtl/error_message.cpp
+++ b/packages/PEGTL/src/test/pegtl/error_message.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/file_cstream.cpp b/packages/PEGTL/src/test/pegtl/file_cstream.cpp
index 93051028300e02b64231ee21f0fb8bbba8427969..d51dbd2b08f669e6a1edda1b18f82219fc8ab656 100644
--- a/packages/PEGTL/src/test/pegtl/file_cstream.cpp
+++ b/packages/PEGTL/src/test/pegtl/file_cstream.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <clocale>
 #include <cstdio>
diff --git a/packages/PEGTL/src/test/pegtl/file_file.cpp b/packages/PEGTL/src/test/pegtl/file_file.cpp
index 61abd5e31a1e1a73ae8b8cc8f0e139088fdc455a..fcc20af53b7d42e61ae4362a2a409d8b469be792 100644
--- a/packages/PEGTL/src/test/pegtl/file_file.cpp
+++ b/packages/PEGTL/src/test/pegtl/file_file.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_file.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/file_istream.cpp b/packages/PEGTL/src/test/pegtl/file_istream.cpp
index 41d1a84eb74e99e265c149fd9c32041e1a0f9933..cb4e3cae24b090ec563cd5c19279bf2a326f4820 100644
--- a/packages/PEGTL/src/test/pegtl/file_istream.cpp
+++ b/packages/PEGTL/src/test/pegtl/file_istream.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <cerrno>
 #include <fstream>
diff --git a/packages/PEGTL/src/test/pegtl/file_mmap.cpp b/packages/PEGTL/src/test/pegtl/file_mmap.cpp
index e9fd5804d06db85350aeadb05ccde36859561c96..afa83d1c3a635c36122aae00ca22010fabf00bf1 100644
--- a/packages/PEGTL/src/test/pegtl/file_mmap.cpp
+++ b/packages/PEGTL/src/test/pegtl/file_mmap.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 // this include gives us _POSIX_MAPPED_FILES to test and mmap_input<> if it is set
 #include <tao/pegtl/file_input.hpp>
diff --git a/packages/PEGTL/src/test/pegtl/file_read.cpp b/packages/PEGTL/src/test/pegtl/file_read.cpp
index 4e2ee3bb7e15d15fedfe770cb2ca6812bd515dc4..963120e7d66733c080c98cfefa17a703d6934879 100644
--- a/packages/PEGTL/src/test/pegtl/file_read.cpp
+++ b/packages/PEGTL/src/test/pegtl/file_read.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_file.hpp"
diff --git "a/packages/PEGTL/src/test/pegtl/file_\303\244\303\266\303\274\360\235\204\236_data.txt" "b/packages/PEGTL/src/test/pegtl/file_\303\244\303\266\303\274\360\235\204\236_data.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..d1c7bba09c907f77a6eb263e90b8a5d5b7873a7e
--- /dev/null
+++ "b/packages/PEGTL/src/test/pegtl/file_\303\244\303\266\303\274\360\235\204\236_data.txt"
@@ -0,0 +1,11 @@
+dummy content
+dummy content
+dummy content
+dummy content
+dummy content
+dummy content
+dummy content
+dummy content
+dummy content
+dummy content
+dummy content
diff --git a/packages/PEGTL/src/test/pegtl/icu_general.cpp b/packages/PEGTL/src/test/pegtl/icu_general.cpp
index 7191cbbe4188c627ac35ee08505f51f91fa8dbc1..15e6e62cd7424d3c1c00687ea44675b607ae1c2e 100644
--- a/packages/PEGTL/src/test/pegtl/icu_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/icu_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/internal_endian.cpp b/packages/PEGTL/src/test/pegtl/internal_endian.cpp
index 50296fd176bab4dca5a457de3248e2834d072bd1..f1bb6b1cb57b838aea115e93aa0b424931a758e5 100644
--- a/packages/PEGTL/src/test/pegtl/internal_endian.cpp
+++ b/packages/PEGTL/src/test/pegtl/internal_endian.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <tao/pegtl/contrib/internal/endian.hpp>
 
diff --git a/packages/PEGTL/src/test/pegtl/internal_file_mapper.cpp b/packages/PEGTL/src/test/pegtl/internal_file_mapper.cpp
index 322aec8ba0fa8b7763ef4449882b08782e312130..94f6e3f5bf94eb71c1e4e0bbdbb6e0dd0e653e91 100644
--- a/packages/PEGTL/src/test/pegtl/internal_file_mapper.cpp
+++ b/packages/PEGTL/src/test/pegtl/internal_file_mapper.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions ) || !defined( _POSIX_MAPPED_FILES )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/internal_file_opener.cpp b/packages/PEGTL/src/test/pegtl/internal_file_opener.cpp
index cea22c2271399fce03ae4f8dad545bf8468d111d..1ca75d9609c386e2fd1945b25c24868d29fb5b94 100644
--- a/packages/PEGTL/src/test/pegtl/internal_file_opener.cpp
+++ b/packages/PEGTL/src/test/pegtl/internal_file_opener.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions ) || !defined( _POSIX_MAPPED_FILES )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/limit_bytes.cpp b/packages/PEGTL/src/test/pegtl/limit_bytes.cpp
index 94e66c3f69bebc1748b9d86a322ee5518012492c..f376ad835650bd77f27cd284bf3323ef11bb7afa 100644
--- a/packages/PEGTL/src/test/pegtl/limit_bytes.cpp
+++ b/packages/PEGTL/src/test/pegtl/limit_bytes.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <tao/pegtl/contrib/limit_bytes.hpp>
 
diff --git a/packages/PEGTL/src/test/pegtl/limit_depth.cpp b/packages/PEGTL/src/test/pegtl/limit_depth.cpp
index 8fb1b2713c3091278d716a4857a9f34e3e1f2fe4..3f9df51639fef2d3eef3d5414845fd29c4163666 100644
--- a/packages/PEGTL/src/test/pegtl/limit_depth.cpp
+++ b/packages/PEGTL/src/test/pegtl/limit_depth.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <tao/pegtl/contrib/limit_depth.hpp>
 
diff --git a/packages/PEGTL/src/test/pegtl/main.hpp b/packages/PEGTL/src/test/pegtl/main.hpp
index f216e73dc4bbfe40be8f883eaa81a97bee648c26..cb09aebbe7c1832dce27c9f46c8a370d97a3ef47 100644
--- a/packages/PEGTL/src/test/pegtl/main.hpp
+++ b/packages/PEGTL/src/test/pegtl/main.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_MAIN_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_MAIN_HPP
diff --git a/packages/PEGTL/src/test/pegtl/parse_error.cpp b/packages/PEGTL/src/test/pegtl/parse_error.cpp
index 6141e3efc7549ac1e48e8de36cb33ec5dcacd2cc..7fda8f61eea8ab51a9b077e7b8791097aba6e442 100644
--- a/packages/PEGTL/src/test/pegtl/parse_error.cpp
+++ b/packages/PEGTL/src/test/pegtl/parse_error.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/pegtl_string_t.cpp b/packages/PEGTL/src/test/pegtl/pegtl_string_t.cpp
index ff3cc3ac8498e245134c5cc775803c2d548b49a8..41147a373d26bdb40c23f3cf6f600b544e212293 100644
--- a/packages/PEGTL/src/test/pegtl/pegtl_string_t.cpp
+++ b/packages/PEGTL/src/test/pegtl/pegtl_string_t.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <type_traits>
 
diff --git a/packages/PEGTL/src/test/pegtl/position.cpp b/packages/PEGTL/src/test/pegtl/position.cpp
index 666f182ab5bf79f2ab98ebfa07705a68cc099eb1..60590e47492627cc48448d98308871d62dc046e5 100644
--- a/packages/PEGTL/src/test/pegtl/position.cpp
+++ b/packages/PEGTL/src/test/pegtl/position.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/restart_input.cpp b/packages/PEGTL/src/test/pegtl/restart_input.cpp
index 3cb964c88bc0019aa15c24201980b044659fcfa1..648d118289f8a425f61627e4ea9d2d9f206b6b43 100644
--- a/packages/PEGTL/src/test/pegtl/restart_input.cpp
+++ b/packages/PEGTL/src/test/pegtl/restart_input.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/result_type.hpp b/packages/PEGTL/src/test/pegtl/result_type.hpp
index 91b84a6e0906bb44bea4c375558d1537ea298705..9f8448f3e7de52869b388ab776d5e18101391415 100644
--- a/packages/PEGTL/src/test/pegtl/result_type.hpp
+++ b/packages/PEGTL/src/test/pegtl/result_type.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_RESULT_TYPE_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_RESULT_TYPE_HPP
diff --git a/packages/PEGTL/src/test/pegtl/rule_action.cpp b/packages/PEGTL/src/test/pegtl/rule_action.cpp
index cdaae0c7c4df719f4d45cba93636eaa0cb51af17..c9e5a9f600340fdbf1f9c7cf3e46fb8f1e11b3be 100644
--- a/packages/PEGTL/src/test/pegtl/rule_action.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_action.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_apply.cpp b/packages/PEGTL/src/test/pegtl/rule_apply.cpp
index 7c5082272d3380bbec89a75e933c4584d00c8a1d..6ae99f88a5b216b01eea245915289baeda386896 100644
--- a/packages/PEGTL/src/test/pegtl/rule_apply.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_apply.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_apply0.cpp b/packages/PEGTL/src/test/pegtl/rule_apply0.cpp
index 678c424dd19fabf5df9f6b74823e6314656d5d8d..2c4fdfffe86bfe68f384ab84b53355d8a7b74fa3 100644
--- a/packages/PEGTL/src/test/pegtl/rule_apply0.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_apply0.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_at.cpp b/packages/PEGTL/src/test/pegtl/rule_at.cpp
index 8d24f4e217fdcc1105129a1d5d49637355cdfc54..996081fda56252f71c57de571a4ade36b4bc0f12 100644
--- a/packages/PEGTL/src/test/pegtl/rule_at.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_at.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_bof.cpp b/packages/PEGTL/src/test/pegtl/rule_bof.cpp
index e7e563ee7999af7fbd2ea856e67a139531d5baf6..7462e560b4edfeec65c4ae2960a58790beef7875 100644
--- a/packages/PEGTL/src/test/pegtl/rule_bof.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_bof.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_bol.cpp b/packages/PEGTL/src/test/pegtl/rule_bol.cpp
index 34c63020bcedddef349c9428aab458df37b93ef0..837fcc331eebf5a82b9f6703c781d331470086f4 100644
--- a/packages/PEGTL/src/test/pegtl/rule_bol.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_bol.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_bytes.cpp b/packages/PEGTL/src/test/pegtl/rule_bytes.cpp
index 2caa4834a28deb9ba00c0b2ca4830030fd961f3d..806640778857cfa885171e8a6046d8eb6f71c86d 100644
--- a/packages/PEGTL/src/test/pegtl/rule_bytes.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_bytes.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_control.cpp b/packages/PEGTL/src/test/pegtl/rule_control.cpp
index baaef260eab0c59bb4ab438ddb156993fd5b5757..1b69ebd453c300e8de8cb37ce0543f3985b0e9a8 100644
--- a/packages/PEGTL/src/test/pegtl/rule_control.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_control.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_disable.cpp b/packages/PEGTL/src/test/pegtl/rule_disable.cpp
index acbbb9f3f1a9a0832f878b86557a07579468bbb7..65570121b73fad4b103dae193cbb019180f982cc 100644
--- a/packages/PEGTL/src/test/pegtl/rule_disable.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_disable.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_discard.cpp b/packages/PEGTL/src/test/pegtl/rule_discard.cpp
index 8e5b93d1e408edd016a1668c37e8522054244f3e..bce896337e55c24232496ef593ef926c1277cc5b 100644
--- a/packages/PEGTL/src/test/pegtl/rule_discard.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_discard.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_enable.cpp b/packages/PEGTL/src/test/pegtl/rule_enable.cpp
index 9741545fe7390a753566f0b04b6ddb43fcb26c1b..63c4d1560339faaa290c9ead46c2b82517340528 100644
--- a/packages/PEGTL/src/test/pegtl/rule_enable.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_enable.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_eof.cpp b/packages/PEGTL/src/test/pegtl/rule_eof.cpp
index b5098b40cf178a1329d0fa27d21de660a1503b12..8ed986e9b6a6d40821811d8dbe061b74cac8bd26 100644
--- a/packages/PEGTL/src/test/pegtl/rule_eof.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_eof.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_failure.cpp b/packages/PEGTL/src/test/pegtl/rule_failure.cpp
index c1481d4d1d6c7d259fc00260553a31c514760434..19044b3446f767d1af60e7c46daf353d3486f8ab 100644
--- a/packages/PEGTL/src/test/pegtl/rule_failure.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_failure.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_if_apply.cpp b/packages/PEGTL/src/test/pegtl/rule_if_apply.cpp
index 803d08f49198cb038546262a20a77ef821e1f23c..74602f9a77e384c5dc69eb3266d3bbaa32de5755 100644
--- a/packages/PEGTL/src/test/pegtl/rule_if_apply.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_if_apply.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_seqs.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_if_must.cpp b/packages/PEGTL/src/test/pegtl/rule_if_must.cpp
index 6dd3754232ce1b333215ab30dc0cd496ae7920d8..13f7332ae56146a674760412432539bc0c494e85 100644
--- a/packages/PEGTL/src/test/pegtl/rule_if_must.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_if_must.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_if_must_else.cpp b/packages/PEGTL/src/test/pegtl/rule_if_must_else.cpp
index 0b7f370596405829571a99bdc956953e7a71c204..fa28715d0f1b4170f3607d94383a97e9820a0b27 100644
--- a/packages/PEGTL/src/test/pegtl/rule_if_must_else.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_if_must_else.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_if_then_else.cpp b/packages/PEGTL/src/test/pegtl/rule_if_then_else.cpp
index dc15f6fb0688e270487563f40fe501707caa2aa1..b1596f69085b28266f70d966bc52e8dcb7d1f2b2 100644
--- a/packages/PEGTL/src/test/pegtl/rule_if_then_else.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_if_then_else.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_ifmt.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_list.cpp b/packages/PEGTL/src/test/pegtl/rule_list.cpp
index 106942f6dcc348a55c45d7f0fa958b64bfc50160..2abf474ee895277bf3d88d7d6a0e11e9ea4c5c11 100644
--- a/packages/PEGTL/src/test/pegtl/rule_list.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_list.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_list_must.cpp b/packages/PEGTL/src/test/pegtl/rule_list_must.cpp
index 9de21172d522e07c5d71980b4965116b952f0708..0fab79e8a76524d455a32f45b3f39c0e9f052607 100644
--- a/packages/PEGTL/src/test/pegtl/rule_list_must.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_list_must.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_list_tail.cpp b/packages/PEGTL/src/test/pegtl/rule_list_tail.cpp
index d6e9dd59b8a507916550c7ca6e43f8506f286f18..41296111d8b3356af4771f20bd934d575e87d8ff 100644
--- a/packages/PEGTL/src/test/pegtl/rule_list_tail.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_list_tail.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_minus.cpp b/packages/PEGTL/src/test/pegtl/rule_minus.cpp
index 4fd5f5722022b4b29f0ab9238338fa0cc0bdeffc..66c37197343fe2a3376554d14dc0e94827d639ec 100644
--- a/packages/PEGTL/src/test/pegtl/rule_minus.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_minus.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2016-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_must.cpp b/packages/PEGTL/src/test/pegtl/rule_must.cpp
index f1d481132352e07f857cf5cd728a858934215f27..1065457c58dca1c5796bf9b15c6a88f27081fafa 100644
--- a/packages/PEGTL/src/test/pegtl/rule_must.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_must.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_not_at.cpp b/packages/PEGTL/src/test/pegtl/rule_not_at.cpp
index 77cc80fc64e807d35d24687accc75779df600218..aa5ba3b6641a13a1f2943f98887f149361e96c98 100644
--- a/packages/PEGTL/src/test/pegtl/rule_not_at.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_not_at.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_opt.cpp b/packages/PEGTL/src/test/pegtl/rule_opt.cpp
index c07d22a4f619d5031b42a118d054f1f7f1ad2892..f05a738b7514319540fd3bf6f03f075bebbe5065 100644
--- a/packages/PEGTL/src/test/pegtl/rule_opt.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_opt.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_opt_must.cpp b/packages/PEGTL/src/test/pegtl/rule_opt_must.cpp
index 5c1ce6e45daf75561ee38a232041913139276390..0adb157f657ccc8fb822782eab8d56c99bab3926 100644
--- a/packages/PEGTL/src/test/pegtl/rule_opt_must.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_opt_must.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_pad.cpp b/packages/PEGTL/src/test/pegtl/rule_pad.cpp
index 918b1c67b8e5c987b4362d568d66eb6c068b5e6e..2bbfc254b2294c58d678e30540717005fb450174 100644
--- a/packages/PEGTL/src/test/pegtl/rule_pad.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_pad.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_pad_opt.cpp b/packages/PEGTL/src/test/pegtl/rule_pad_opt.cpp
index 5ff4ded637a490f8e82cfb6d5b5df0d586c3454c..d21bb55d8ef48f307918216618941aa71e7db600 100644
--- a/packages/PEGTL/src/test/pegtl/rule_pad_opt.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_pad_opt.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_plus.cpp b/packages/PEGTL/src/test/pegtl/rule_plus.cpp
index d1a82ad3462177a93b7f34c6c64fb2b75463ff86..f0b0942d34399a8d700f7c234e54829f23cc628c 100644
--- a/packages/PEGTL/src/test/pegtl/rule_plus.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_plus.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_raise.cpp b/packages/PEGTL/src/test/pegtl/rule_raise.cpp
index 25a67c8136e84fa840775efba6f2487c7121f856..7eaa05d97f5cdb1bc7f73c8c0c19e2934fad116e 100644
--- a/packages/PEGTL/src/test/pegtl/rule_raise.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_raise.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_rematch.cpp b/packages/PEGTL/src/test/pegtl/rule_rematch.cpp
index 22fca401d93226b98e27ff5f0ee98858d50168a3..3c505d0b0cd627d47e4bd71d415edc9c64d260f1 100644
--- a/packages/PEGTL/src/test/pegtl/rule_rematch.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_rematch.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2019-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_rep.cpp b/packages/PEGTL/src/test/pegtl/rule_rep.cpp
index 7d7ef941f44228ffc9119e370c2d197cc6a1bc75..ce259438a70d88b1c3030ebaa95b4d1d47c6c614 100644
--- a/packages/PEGTL/src/test/pegtl/rule_rep.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_rep.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_rep_max.cpp b/packages/PEGTL/src/test/pegtl/rule_rep_max.cpp
index 0b4a2c0d2c0c767e1f00281350aababa89fd8e49..2ff63e9cc90302325b70e90ffd2f53302b89fa0c 100644
--- a/packages/PEGTL/src/test/pegtl/rule_rep_max.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_rep_max.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_rep_min.cpp b/packages/PEGTL/src/test/pegtl/rule_rep_min.cpp
index 8564aabbf972551159d06b6b3c2e2fb72e20db8d..614db7363e91a188c27cd03e8f38523e3b28b2be 100644
--- a/packages/PEGTL/src/test/pegtl/rule_rep_min.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_rep_min.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_rep_min_max.cpp b/packages/PEGTL/src/test/pegtl/rule_rep_min_max.cpp
index 95d3db6311e1eaa0efb867d53f439821d23d6e1e..ebf85c500b889a5df9b39322f85c1336b04cf7e1 100644
--- a/packages/PEGTL/src/test/pegtl/rule_rep_min_max.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_rep_min_max.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_rep_opt.cpp b/packages/PEGTL/src/test/pegtl/rule_rep_opt.cpp
index 0f5a6b6779f9b86fb9c5a0908c9d8a43e7ce3e57..c94a11ab7c0186463b1d714eea5e2b8d3b5f3917 100644
--- a/packages/PEGTL/src/test/pegtl/rule_rep_opt.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_rep_opt.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/rule_require.cpp b/packages/PEGTL/src/test/pegtl/rule_require.cpp
index e6598a4ee577bac69252a3997db216887a89a61d..b93e2e3a1c31ed9059de670ec98c3360a49697af 100644
--- a/packages/PEGTL/src/test/pegtl/rule_require.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_require.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2017-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_seq.cpp b/packages/PEGTL/src/test/pegtl/rule_seq.cpp
index b80f478b4bbcc85ab12208163b0f3fb4109216b6..f5eac6bbdbaad5755b2bbe5f62ec99885f3e569c 100644
--- a/packages/PEGTL/src/test/pegtl/rule_seq.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_seq.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_sor.cpp b/packages/PEGTL/src/test/pegtl/rule_sor.cpp
index ec4258c2c8f93c5735441b6828bad3fcc5702c0f..7d087cb73c1a279bfb1beff03e7f7d8d52a7f9a9 100644
--- a/packages/PEGTL/src/test/pegtl/rule_sor.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_sor.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_star.cpp b/packages/PEGTL/src/test/pegtl/rule_star.cpp
index c6f46e0001cca3de8950ea8a5bb3a0e8efa467aa..df7d63e08164efe27e35c2572c2d7a9fe0f04dde 100644
--- a/packages/PEGTL/src/test/pegtl/rule_star.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_star.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_star_must.cpp b/packages/PEGTL/src/test/pegtl/rule_star_must.cpp
index 20747861a808d444e93ffb956fd9f64ecf8e961d..6b22eac70ffe94e4180473135e29fa7d68374941 100644
--- a/packages/PEGTL/src/test/pegtl/rule_star_must.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_star_must.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_state.cpp b/packages/PEGTL/src/test/pegtl/rule_state.cpp
index 8d92565b91103aaf6675bed55e5aa21e55674aa2..f29701262982d0eb081acbc42725dbd792b09aa7 100644
--- a/packages/PEGTL/src/test/pegtl/rule_state.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_state.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_success.cpp b/packages/PEGTL/src/test/pegtl/rule_success.cpp
index cf810c685992cd7c41b2903472ea57ceaa43805f..03e0835369ed22bb8981190a6284b90acb729421 100644
--- a/packages/PEGTL/src/test/pegtl/rule_success.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_success.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/rule_try_catch.cpp b/packages/PEGTL/src/test/pegtl/rule_try_catch.cpp
index c9a5103045946788beb63bc1a66a5ffb7054d887..909cc77d9943b7c76d1259cd9da90ceeec3d225e 100644
--- a/packages/PEGTL/src/test/pegtl/rule_try_catch.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_try_catch.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #if !defined( __cpp_exceptions )
 #include <iostream>
diff --git a/packages/PEGTL/src/test/pegtl/rule_until.cpp b/packages/PEGTL/src/test/pegtl/rule_until.cpp
index 2d8c7faccdadfefb58824b1111b4ceb43aa495b4..12385c15a5ee30481d2118d1df11bd327d4ea8e5 100644
--- a/packages/PEGTL/src/test/pegtl/rule_until.cpp
+++ b/packages/PEGTL/src/test/pegtl/rule_until.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_meta.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/test.hpp b/packages/PEGTL/src/test/pegtl/test.hpp
index 95c6a59b36da917cfc19d5f97720ba6af3c2051a..e25635ee2c02fbfe6ced5f1cfcd1079e9ec611d0 100644
--- a/packages/PEGTL/src/test/pegtl/test.hpp
+++ b/packages/PEGTL/src/test/pegtl/test.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_TEST_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_TEST_HPP
diff --git a/packages/PEGTL/src/test/pegtl/test_empty.cpp b/packages/PEGTL/src/test/pegtl/test_empty.cpp
index b213f1ea81482c359ad0911c0daa12370a82a16c..bfc9967778d806fb6ee34979cba1ee8232a17c88 100644
--- a/packages/PEGTL/src/test/pegtl/test_empty.cpp
+++ b/packages/PEGTL/src/test/pegtl/test_empty.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 
diff --git a/packages/PEGTL/src/test/pegtl/test_result.cpp b/packages/PEGTL/src/test/pegtl/test_result.cpp
index 2bf7c5355c693e6c6e3473efb5bc3e289afead5f..86554fb558a4cad9e4a3fdb75af72188cee44082 100644
--- a/packages/PEGTL/src/test/pegtl/test_result.cpp
+++ b/packages/PEGTL/src/test/pegtl/test_result.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <sstream>
 
diff --git a/packages/PEGTL/src/test/pegtl/test_setup.cpp b/packages/PEGTL/src/test/pegtl/test_setup.cpp
index a6d87ed887155c48a31282948a19e32c9d3f5816..20c3737bfa10b1986bdca50114aeb613fc25cb75 100644
--- a/packages/PEGTL/src/test/pegtl/test_setup.cpp
+++ b/packages/PEGTL/src/test/pegtl/test_setup.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <iostream>
 #include <utility>
diff --git a/packages/PEGTL/src/test/pegtl/uint16_general.cpp b/packages/PEGTL/src/test/pegtl/uint16_general.cpp
index b1073af47f985541d612e2c3d4dfed8937feede3..134ecbb9baf87083b59a0b516cc1059d07d42a2f 100644
--- a/packages/PEGTL/src/test/pegtl/uint16_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/uint16_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/uint32_general.cpp b/packages/PEGTL/src/test/pegtl/uint32_general.cpp
index 18dfd34e6d5450f5c6e9325a71447e055f03e5ee..f5f259df1b85902578c1fcb058fdd2011322d8bd 100644
--- a/packages/PEGTL/src/test/pegtl/uint32_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/uint32_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/uint64_general.cpp b/packages/PEGTL/src/test/pegtl/uint64_general.cpp
index 54aba3821ce31a376b168940221aea6d7fb2c526..91daedb35648d9bef7a01ed482e5e4f92c31ffaf 100644
--- a/packages/PEGTL/src/test/pegtl/uint64_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/uint64_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/uint8_general.cpp b/packages/PEGTL/src/test/pegtl/uint8_general.cpp
index 211fd7399410bd8af471eaf0e89bcbe8b70afb38..ee6baff86d76ca5992d7171392a293c2a9761111 100644
--- a/packages/PEGTL/src/test/pegtl/uint8_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/uint8_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2018-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/utf16_general.cpp b/packages/PEGTL/src/test/pegtl/utf16_general.cpp
index 12461ca55093149fcd76f465ed54a43c75b9c4e6..7e32a8cce8a3e4de9d92dc00bc419fe98c885edb 100644
--- a/packages/PEGTL/src/test/pegtl/utf16_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/utf16_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2015-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_rule.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/utf32_general.cpp b/packages/PEGTL/src/test/pegtl/utf32_general.cpp
index c63a44dfeb1edc2a2c55663bd8a47a4eab8f5945..21895a9dfb3534c4a639c6a3c2d7d54a3cbc4876 100644
--- a/packages/PEGTL/src/test/pegtl/utf32_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/utf32_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_rule.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/utf8_general.cpp b/packages/PEGTL/src/test/pegtl/utf8_general.cpp
index c61663e1bf1a857480509cc452e595fac5557564..cab108dc67bfa4766f7b83f505a268eb5177cbd4 100644
--- a/packages/PEGTL/src/test/pegtl/utf8_general.cpp
+++ b/packages/PEGTL/src/test/pegtl/utf8_general.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include "test.hpp"
 #include "verify_char.hpp"
diff --git a/packages/PEGTL/src/test/pegtl/verify_char.hpp b/packages/PEGTL/src/test/pegtl/verify_char.hpp
index 4f48fabc0aca00da4f63137ccdcef7780ab01694..32290bc4f82a36d1bf4763c1a5d4bb11205fb122 100644
--- a/packages/PEGTL/src/test/pegtl/verify_char.hpp
+++ b/packages/PEGTL/src/test/pegtl/verify_char.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_CHAR_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_CHAR_HPP
diff --git a/packages/PEGTL/src/test/pegtl/verify_file.hpp b/packages/PEGTL/src/test/pegtl/verify_file.hpp
index c35c972488173d7559ec0fd719eba548fc6a3c68..b8a27230c97778eb286be3387d25dffb02b6efb9 100644
--- a/packages/PEGTL/src/test/pegtl/verify_file.hpp
+++ b/packages/PEGTL/src/test/pegtl/verify_file.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_FILE_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_FILE_HPP
diff --git a/packages/PEGTL/src/test/pegtl/verify_ifmt.hpp b/packages/PEGTL/src/test/pegtl/verify_ifmt.hpp
index c957693b54fe1492dafe2412eb1dc1e452433173..1dffef16e4b02cbfd93e8e6983cb0e14c5afbfcc 100644
--- a/packages/PEGTL/src/test/pegtl/verify_ifmt.hpp
+++ b/packages/PEGTL/src/test/pegtl/verify_ifmt.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_IFMT_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_IFMT_HPP
diff --git a/packages/PEGTL/src/test/pegtl/verify_impl.hpp b/packages/PEGTL/src/test/pegtl/verify_impl.hpp
index aecc70b1823ec8a0f63b3e855379f79f8b4badc5..6a4a93f898f08aa29558eb334bc0b0e4d11326df 100644
--- a/packages/PEGTL/src/test/pegtl/verify_impl.hpp
+++ b/packages/PEGTL/src/test/pegtl/verify_impl.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_IMPL_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_IMPL_HPP
diff --git a/packages/PEGTL/src/test/pegtl/verify_meta.hpp b/packages/PEGTL/src/test/pegtl/verify_meta.hpp
index e7350f132df2111ac520b6a088f73f355bfba0a0..40d871210d756f4c90579c75281d5420395bac56 100644
--- a/packages/PEGTL/src/test/pegtl/verify_meta.hpp
+++ b/packages/PEGTL/src/test/pegtl/verify_meta.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_META_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_META_HPP
diff --git a/packages/PEGTL/src/test/pegtl/verify_rule.hpp b/packages/PEGTL/src/test/pegtl/verify_rule.hpp
index 390a5a369ae670a094512aed100c9a611049f035..d24076b9d4eb38d08fa8b4b0d4be52c6e7238474 100644
--- a/packages/PEGTL/src/test/pegtl/verify_rule.hpp
+++ b/packages/PEGTL/src/test/pegtl/verify_rule.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_RULE_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_RULE_HPP
diff --git a/packages/PEGTL/src/test/pegtl/verify_seqs.hpp b/packages/PEGTL/src/test/pegtl/verify_seqs.hpp
index 1b01aad29e054acc426ae17be48db5854651c103..487c995276b64ccbd82fd75496229b6fa5289e9f 100644
--- a/packages/PEGTL/src/test/pegtl/verify_seqs.hpp
+++ b/packages/PEGTL/src/test/pegtl/verify_seqs.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2014-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #ifndef TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_SEQS_HPP
 #define TAO_PEGTL_SRC_TEST_PEGTL_VERIFY_SEQS_HPP
diff --git a/packages/PEGTL/src/test/pegtl/visit.cpp b/packages/PEGTL/src/test/pegtl/visit.cpp
index 2bdcb86e2b0229a8f1bcdc6ec5b36f27b6b9640c..2b30442f05dcceaa0c7de9fba8b8d901de8e5a94 100644
--- a/packages/PEGTL/src/test/pegtl/visit.cpp
+++ b/packages/PEGTL/src/test/pegtl/visit.cpp
@@ -1,5 +1,6 @@
 // Copyright (c) 2020-2021 Dr. Colin Hirsch and Daniel Frey
-// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
+// Distributed under the Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)
 
 #include <string>
 #include <vector>
diff --git a/packages/kokkos/.clang-format-ignore b/packages/kokkos/.clang-format-ignore
index b163a2bfeaf4e865925fb96f46260fd3ec2cfcb9..43d242c3106a29c063e0258bbd4e4553f66f883c 100644
--- a/packages/kokkos/.clang-format-ignore
+++ b/packages/kokkos/.clang-format-ignore
@@ -1,2 +1,3 @@
 core/unit_test/config/results/*
 tpls/gtest/gtest/*
+core/src/desul/*
diff --git a/packages/kokkos/.clang-tidy b/packages/kokkos/.clang-tidy
index 207a105c5bdf60b807db528d612aac89e6bb88b6..2b0d6e51d438948c2d5ef85e100c97ca16184e9b 100644
--- a/packages/kokkos/.clang-tidy
+++ b/packages/kokkos/.clang-tidy
@@ -1,3 +1,3 @@
-Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr'
+Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast'
 FormatStyle: file
 HeaderFilterRegex: '.*/*.hpp'
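Editor's note: the added `cppcoreguidelines-pro-type-cstyle-cast` check reports C-style casts in favor of the C++ cast keywords. A minimal sketch of invoking just that check by hand — the file name and compile flags after `--` are placeholders, not taken from the repository:

    clang-tidy core/src/SomeFile.cpp \
      -checks='-*,cppcoreguidelines-pro-type-cstyle-cast' \
      -- -Icore/src -std=c++14

The `HeaderFilterRegex` above then controls which headers diagnostics are also reported for.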
diff --git a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
index a9dc0ec86cd77c49cb6958d01608ca47e5e0dab9..b76167f330a87eaf79af25f706c33d3e910865d1 100644
--- a/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
+++ b/packages/kokkos/.github/workflows/continuous-integration-workflow.yml
@@ -40,29 +40,29 @@ jobs:
           path: ~/.ccache
           key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${github.ref}-${{ github.sha }}
           restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{github.ref}}
-      - name: Get trial license
-        if: ${{ matrix.cxx == 'icpc' }}
-        run: |
-          mkdir ~/Licenses
-          curl https://dynamicinstaller.intel.com/api/v2/license > ~/Licenses/intel.lic
       - name: maybe_disable_death_tests
         if: ${{ matrix.distro == 'fedora:rawhide' }}
         run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV
-      - name: build-and-test
+      - name: CMake
         run: |
-          ccache -z
-          cmake \
+          cmake -B builddir \
             -DCMAKE_INSTALL_PREFIX=/usr \
             -DKokkos_ENABLE_HWLOC=ON \
             -DKokkos_ENABLE_OPENMP=${{ matrix.openmp }} \
             -DKokkos_ENABLE_TESTS=ON \
             -DKokkos_ENABLE_EXAMPLES=ON \
+            -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+            -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
             -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
-            -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \
-            -DBUILD_NAME=${{ matrix.distro }}-${{ matrix.cxx }} \
-            -DBUILD_JOBS=2 -DBINARY_DIR=builddir -DSITE=GitHub-Linux \
-            -P cmake/KokkosCI.cmake
+            -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
+      - name: Build
+        run: |
+          ccache -z
+          cmake --build builddir --parallel 2
           ccache -s
+      - name: Tests
+        working-directory: builddir
+        run: ctest --output-on-failure
       - name: Test DESTDIR Install
         run: DESTDIR=${PWD}/install cmake --build builddir --target install && rm -rf ${PWD}/install/usr && rmdir ${PWD}/install
       - name: Install
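Editor's note: the workflow above replaces the single `-P cmake/KokkosCI.cmake` driver step (which consumed `BUILD_NAME`, `BUILD_JOBS`, `SITE`, etc.) with explicit configure, build, and test steps. A rough local equivalent of those new steps, with `g++` and `Release` standing in for the matrix values:

    # Rough local equivalent of the new Configure/Build/Tests steps.
    cmake -B builddir \
      -DCMAKE_INSTALL_PREFIX=/usr \
      -DKokkos_ENABLE_HWLOC=ON \
      -DKokkos_ENABLE_OPENMP=ON \
      -DKokkos_ENABLE_TESTS=ON \
      -DKokkos_ENABLE_EXAMPLES=ON \
      -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
      -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
      -DCMAKE_CXX_COMPILER=g++ \
      -DCMAKE_BUILD_TYPE=Release
    cmake --build builddir --parallel 2
    (cd builddir && ctest --output-on-failure)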
diff --git a/packages/kokkos/.github/workflows/osx.yml b/packages/kokkos/.github/workflows/osx.yml
index 855b557c829a609f34b82c7e5f307eef60cf0ede..178af12405cc2f6fc24a5ae46adf034b1c73a94e 100644
--- a/packages/kokkos/.github/workflows/osx.yml
+++ b/packages/kokkos/.github/workflows/osx.yml
@@ -21,15 +21,19 @@ jobs:
 
     steps:
       - uses: actions/checkout@v2
-      - name: build-and-test
+      - name: configure
         run:
-          cmake
+          cmake -B build .
             -DKokkos_ENABLE_${{ matrix.backend }}=On
             -DCMAKE_CXX_FLAGS="-Werror"
             -DCMAKE_CXX_STANDARD=14
             -DKokkos_ENABLE_COMPILER_WARNINGS=ON
+            -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF
             -DKokkos_ENABLE_TESTS=On
             -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
-            -DBUILD_NAME=macOS-${{ matrix.backend }}
-            -DTARGET=install -DBUILD_JOBS=2 -DSITE=GitHub-OSX
-            -P cmake/KokkosCI.cmake
+      - name: build
+        run:
+          cmake --build build --parallel 2
+      - name: test
+        working-directory: build
+        run: ctest --output-on-failure
diff --git a/packages/kokkos/.gitrepo b/packages/kokkos/.gitrepo
index 85e71521db3fbaa780bb77fb42f5c3e74ae26800..bfbe5e6fd3ec3ae381fe5adbd8b39d0797bff2fa 100644
--- a/packages/kokkos/.gitrepo
+++ b/packages/kokkos/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = git@github.com:kokkos/kokkos.git
 	branch = master
-	commit = c28a8b03288b185f846ddfb1b7c08213e12e2634
-	parent = 2c8a5742df289f02f5ca31bce1e293dbfdb1701e
+	commit = 2879e23507bcb21adb739d6317b3430f665de4a6
+	parent = 36833c0c0fc1a841eaed63df6b7d34609307f2a5
 	method = merge
 	cmdver = 0.4.3
diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins
index 001171d648e7cfb2236d17439720562707faaab4..09e8515e96f2bb8255bcaba7304780b484f303ea 100644
--- a/packages/kokkos/.jenkins
+++ b/packages/kokkos/.jenkins
@@ -5,9 +5,12 @@ pipeline {
         CCACHE_DIR = '/tmp/ccache'
         CCACHE_MAXSIZE = '10G'
         CCACHE_CPP2 = 'true'
-        BUILD_JOBS = 8
-        SITE = 'Jenkins'
     }
+
+    options {
+        timeout(time: 6, unit: 'HOURS')
+    }
+
     stages {
         stage('Clang-Format') {
             agent {
@@ -36,7 +39,7 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -44,13 +47,15 @@ pipeline {
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unknown-cuda-version -Wno-gnu-zero-variadic-macro-arguments" \
                                 -DKokkos_ARCH_VOLTA70=ON \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_EXAMPLES=ON \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_SYCL=ON \
                                 -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
                                 -DCMAKE_CXX_STANDARD=17 \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -58,12 +63,12 @@ pipeline {
                         }
                     }
                 }
-                stage('HIP-ROCm-3.8-C++14') {
+                stage('HIP-ROCm-4.2-C++14') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.hipcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.8'
+                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2'
                             label 'rocm-docker && vega'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
                         }
@@ -72,24 +77,23 @@ pipeline {
                         OMP_NUM_THREADS = 8
                         OMP_PLACES = 'threads'
                         OMP_PROC_BIND = 'spread'
-                        LC_ALL = 'C'
                     }
                     steps {
                         sh 'ccache --zero-stats'
                         sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Debug \
                                 -DCMAKE_CXX_COMPILER=hipcc \
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \
                                 -DCMAKE_CXX_STANDARD=14 \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_HIP=ON \
-                                -DKokkos_ARCH_VEGA906=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -97,33 +101,73 @@ pipeline {
                         }
                     }
                 }
-                stage('HIP-ROCm-3.8-C++17') {
+                stage('HIP-ROCm-4.2-C++17') {
                     agent {
                         dockerfile {
                             filename 'Dockerfile.hipcc'
                             dir 'scripts/docker'
-                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.8'
+                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2'
                             label 'rocm-docker && vega'
                             args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
                         }
                     }
-                    environment {
-                        LC_ALL = 'C'
-                    }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=RelWithDebInfo \
                                 -DCMAKE_CXX_COMPILER=hipcc \
                                 -DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
                                 -DCMAKE_CXX_STANDARD=17 \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_HIP=ON \
+                              .. && \
+                              make -j8 && ctest --verbose'''
+                    }
+                    post {
+                        always {
+                            sh 'ccache --show-stats'
+                        }
+                    }
+                }
+                stage('OPENMPTARGET-ROCm-4.2') {
+                    agent {
+                        dockerfile {
+                            filename 'Dockerfile.hipcc'
+                            dir 'scripts/docker'
+                            additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2'
+                            label 'rocm-docker && vega && AMD_Radeon_Instinct_MI60'
+                            args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
+                        }
+                    }
+                    environment {
+                        OMP_NUM_THREADS = 8
+                        OMP_PLACES = 'threads'
+                        OMP_PROC_BIND = 'spread'
+                        LC_ALL = 'C'
+                    }
+                    steps {
+                        sh 'ccache --zero-stats'
+                        sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig'
+                        sh '''rm -rf build && \
+                              cmake \
+                                -Bbuild \
+                                -DCMAKE_BUILD_TYPE=Debug \
+                                -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
+                                -DCMAKE_CXX_STANDARD=17 \
+                                -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
+                                -DKokkos_ENABLE_TESTS=ON \
+                                -DKokkos_ENABLE_OPENMPTARGET=ON \
+                                -DKokkos_ENABLE_OPENMP=ON \
                                 -DKokkos_ARCH_VEGA906=ON \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              && \
+                              cmake --build build --parallel ${BUILD_JOBS} && \
+                              cd build && ctest --output-on-failure
+                        '''
                     }
                     post {
                         always {
@@ -142,19 +186,21 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=RelWithDebInfo \
                                 -DCMAKE_CXX_COMPILER=clang++ \
                                 -DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror -Wno-undefined-internal -Wno-pass-failed" \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_TUNING=ON \
                                 -DKokkos_ENABLE_OPENMPTARGET=ON \
                                 -DKokkos_ARCH_VOLTA70=ON \
                                 -DCMAKE_CXX_STANDARD=17 \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -173,7 +219,7 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \
@@ -182,13 +228,15 @@ pipeline {
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DCMAKE_CXX_STANDARD=14 \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_CUDA=ON \
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_TUNING=ON \
                                 -DKokkos_ARCH_VOLTA70=ON \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake'''
+                              .. && \
+                              make -j8 && ctest --verbose'''
                     }
                     post {
                         always {
@@ -244,7 +292,7 @@ pipeline {
                     steps {
                         sh 'ccache --zero-stats'
                         sh '''rm -rf install && mkdir -p install && \
-                              rm -rf build && \
+                              rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_COMPILER=g++-8 \
@@ -256,10 +304,12 @@ pipeline {
                                 -DKokkos_ENABLE_CUDA_LAMBDA=OFF \
                                 -DKokkos_ENABLE_CUDA_UVM=ON \
                                 -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \
-                                -DCMAKE_INSTALL_PREFIX=${PWD}/install \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                                -DTARGET=install \
-                              -P cmake/KokkosCI.cmake && \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
+                                -DCMAKE_INSTALL_PREFIX=${PWD}/../install \
+                              .. && \
+                              make -j8 install && \
+                              cd .. && \
                               rm -rf build-tests && mkdir -p build-tests && cd build-tests && \
                               export CMAKE_PREFIX_PATH=${PWD}/../install && \
                               cmake \
@@ -302,7 +352,7 @@ pipeline {
                     }
                     steps {
                         sh 'ccache --zero-stats'
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Debug \
                                 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -312,14 +362,14 @@ pipeline {
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
                                 -DKokkos_ENABLE_DEBUG=ON \
                                 -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_CUDA=ON \
                                 -DKokkos_ENABLE_CUDA_LAMBDA=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                                -DTARGET=install \
-                              -P cmake/KokkosCI.cmake && \
-                              cd example/build_cmake_in_tree && \
+                              .. && \
+                              make -j8 && ctest --verbose && \
+                              cd ../example/build_cmake_in_tree && \
                               rm -rf build && mkdir -p build && cd build && \
                               cmake -DCMAKE_CXX_STANDARD=14 .. && make -j8 && ctest --verbose'''
                     }
@@ -342,18 +392,21 @@ pipeline {
                         OMP_PROC_BIND = 'true'
                     }
                     steps {
-                        sh '''rm -rf build && \
+                        sh '''rm -rf build && mkdir -p build && cd build && \
                               cmake \
                                 -DCMAKE_BUILD_TYPE=Release \
                                 -DCMAKE_CXX_STANDARD=14 \
                                 -DCMAKE_CXX_FLAGS=-Werror \
                                 -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
+                                -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
+                                -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
                                 -DKokkos_ENABLE_TESTS=ON \
                                 -DKokkos_ENABLE_OPENMP=ON \
                                 -DKokkos_ENABLE_LIBDL=OFF \
-                                -DBUILD_NAME=${STAGE_NAME} \
-                              -P cmake/KokkosCI.cmake && \
-                              gcc -I$PWD/core/src core/unit_test/tools/TestCInterface.c'''
+                                -DKokkos_ENABLE_LIBQUADMATH=ON \
+                                -DCMAKE_PREFIX_PATH=/usr/local/lib/gcc/x86_64-unknown-linux-gnu/5.3.0 \
+                              .. && \
+                              make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c'''
                     }
                 }
             }
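Editor's note: as in the workflow changes above, the Jenkins stages now configure into a `build/` directory and run `make`/`ctest` directly instead of going through `-P cmake/KokkosCI.cmake`. Note that `BUILD_JOBS` is no longer exported in the pipeline `environment` block, so the new `OPENMPTARGET-ROCm-4.2` stage's `--parallel ${BUILD_JOBS}` relies on that variable being set elsewhere. A local sketch of that stage with an explicit job count (the ROCm compiler path comes from the stage itself; the job count of 8 is chosen arbitrarily):

    # Local sketch of the OPENMPTARGET-ROCm-4.2 stage.
    rm -rf build
    cmake -Bbuild \
      -DCMAKE_BUILD_TYPE=Debug \
      -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
      -DCMAKE_CXX_STANDARD=17 \
      -DKokkos_ENABLE_COMPILER_WARNINGS=ON \
      -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
      -DKokkos_ENABLE_TESTS=ON \
      -DKokkos_ENABLE_OPENMPTARGET=ON \
      -DKokkos_ENABLE_OPENMP=ON \
      -DKokkos_ARCH_VEGA906=ON
    cmake --build build --parallel 8
    (cd build && ctest --output-on-failure)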
diff --git a/packages/kokkos/.travis.yml b/packages/kokkos/.travis.yml
index 04ef01c1602cf87aae3e39225037d65f49651f62..87d0fd5cf6ed8b9ebc158d1eb8bbe91d56101963 100644
--- a/packages/kokkos/.travis.yml
+++ b/packages/kokkos/.travis.yml
@@ -67,14 +67,13 @@ install:
 
 before_script:
   - ccache -z
-  - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; export BUILD_NAME_SUFFIX="-Coverage"; fi
+  - if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; fi
   - if [[ ! ${CMAKE_BUILD_TYPE} ]]; then export CXXFLAGS="${CXXFLAGS} -O2"; fi
 
 script:
   - export OMP_NUM_THREADS=2
   - export OMP_PLACES=threads
   - export OMP_PROC_BIND=spread
-  - export BUILD_JOBS=2
   # LD_LIBRARY_PATH workaround to find clang's libomp: https://github.com/travis-ci/travis-ci/issues/8613
   - if [[ ${CC} = clang ]]; then export LD_LIBRARY_PATH=/usr/local/clang/lib${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH; fi
   # enable ccache for clang on linux and add CCACHE_CPP2 to avoid 'Argument unused during compilation -I...' warning
@@ -82,17 +81,17 @@ script:
       ln -s /usr/bin/ccache $HOME/bin/clang++;
       export CCACHE_CPP2=yes;
     fi
-  - cmake
+  - mkdir build &&
+    pushd build &&
+    cmake ..
           ${BACKEND:+-DKokkos_ENABLE_${BACKEND}=On}
           -DCMAKE_CXX_FLAGS="${CXXFLAGS} -Werror"
           -DCMAKE_CXX_STANDARD=14
           -DKokkos_ENABLE_COMPILER_WARNINGS=ON
           -DKokkos_ENABLE_TESTS=On
-          ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}}
-          -DBUILD_NAME="${CC}-${BACKEND}${BUILD_NAME_SUFFIX}"
-          -DSITE=Travis
-          -P cmake/KokkosCI.cmake &&
-    pushd build &&
+          ${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}} &&
+    make VERBOSE=1 -j2 &&
+    travis_wait 60 make test CTEST_OUTPUT_ON_FAILURE=1 &&
     make install DESTDIR=${PWD}/install && rm -rf ${PWD}/install/usr/local && rmdir ${PWD}/install/usr &&
     popd
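Editor's note: the `DESTDIR` step at the end of the script doubles as an install-layout check — after removing the expected prefix, `rmdir` only succeeds if nothing was installed outside it. A minimal sketch of the same pattern, run from the build directory:

    # Install into a scratch DESTDIR, then verify nothing landed outside /usr/local.
    make install DESTDIR="$PWD/install"
    rm -rf "$PWD/install/usr/local"
    rmdir "$PWD/install/usr"   # fails (directory not empty) if stray files were installed elsewhere under usr/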
 
diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md
index 7bb6de4cd924051c621bec0ac2cca5f734960e9e..2e779791dde2a83394662f640f90626f66696f28 100644
--- a/packages/kokkos/CHANGELOG.md
+++ b/packages/kokkos/CHANGELOG.md
@@ -1,5 +1,165 @@
 # Change Log
 
+## [3.5.00](https://github.com/kokkos/kokkos/tree/3.5.00) (2021-10-19)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.01...3.5.00)
+
+### Features:
+
+- Add support for quad-precision math functions/traits [\#4098](https://github.com/kokkos/kokkos/pull/4098)
+- Adding ExecutionSpace partitioning function [\#4096](https://github.com/kokkos/kokkos/pull/4096)
+- Improve Python Interop Capabilities [\#4065](https://github.com/kokkos/kokkos/pull/4065)
+- Add half_t Kokkos::rand specialization [\#3922](https://github.com/kokkos/kokkos/pull/3922)
+- Add math special functions: erf, erfcx, expint1, Bessel functions, Hankel functions [\#3920](https://github.com/kokkos/kokkos/pull/3920)
+- Add missing common mathematical functions [\#4043](https://github.com/kokkos/kokkos/pull/4043) [\#4036](https://github.com/kokkos/kokkos/pull/4036) [\#4034](https://github.com/kokkos/kokkos/pull/4034)
+- Let the numeric traits be SFINAE-friendly [\#4038](https://github.com/kokkos/kokkos/pull/4038)
+- Add Desul atomics - enabling memory-order and memory-scope parameters [\#3247](https://github.com/kokkos/kokkos/pull/3247)
+- Add detection idiom from the C++ standard library extension version 2 [\#3980](https://github.com/kokkos/kokkos/pull/3980)
+- Fence Profiling Support in all backends [\#3966](https://github.com/kokkos/kokkos/pull/3966) [\#4304](https://github.com/kokkos/kokkos/pull/4304) [\#4258](https://github.com/kokkos/kokkos/pull/4258) [\#4232](https://github.com/kokkos/kokkos/pull/4232)
+- Significant SYCL enhancements (see below)
+
+### Deprecations:
+
+- Deprecate CUDA_SAFE_CALL and HIP_SAFE_CALL [\#4249](https://github.com/kokkos/kokkos/pull/4249)
+- Deprecate Kokkos::Impl::Timer (Kokkos::Timer has been available for a long time) [\#4201](https://github.com/kokkos/kokkos/pull/4201)
+- Deprecate Experimental::MasterLock [\#4094](https://github.com/kokkos/kokkos/pull/4094)
+- Deprecate Kokkos_TaskPolicy.hpp (headers got reorganized, doesn't remove functionality) [\#4011](https://github.com/kokkos/kokkos/pull/4011)
+- Deprecate backward compatibility features [\#3978](https://github.com/kokkos/kokkos/pull/3978)
+- Update and deprecate is_space::host_memory/execution/mirror_space [\#3973](https://github.com/kokkos/kokkos/pull/3973)
+
+
+### Backends and Archs Enhancements:
+
+- Enabling constbitset constructors in kernels [\#4296](https://github.com/kokkos/kokkos/pull/4296)
+- Use ZeroMemset in View constructor to improve performance [\#4226](https://github.com/kokkos/kokkos/pull/4226)
+- Use memset in deep_copy [\#3944](https://github.com/kokkos/kokkos/pull/3944)
+- Add missing fence() calls in resize(View) that effectively do deep_copy(resized, orig) [\#4212](https://github.com/kokkos/kokkos/pull/4212)
+- Avoid allocations in resize and realloc [\#4207](https://github.com/kokkos/kokkos/pull/4207)
+- StaticCsrGraph: use device type instead of execution space to construct views [\#3991](https://github.com/kokkos/kokkos/pull/3991)
+- Consider std::sort when view is accessible from host [\#3929](https://github.com/kokkos/kokkos/pull/3929)
+- Fix CPP20 warnings except for volatile [\#4312](https://github.com/kokkos/kokkos/pull/4312)
+
+#### SYCL:
+- Introduce SYCLHostUSMSpace [\#4268](https://github.com/kokkos/kokkos/pull/4268)
+- Implement SYCL TeamPolicy for vector_size > 1 [\#4183](https://github.com/kokkos/kokkos/pull/4183)
+- Enable 64bit ranges for SYCL [\#4211](https://github.com/kokkos/kokkos/pull/4211)
+- Don't print SYCL device info in execution space initialization [\#4168](https://github.com/kokkos/kokkos/pull/4168)
+- Improve SYCL MDRangePolicy performance [\#4161](https://github.com/kokkos/kokkos/pull/4161)
+- Use sub_groups in SYCL parallel_scan [\#4147](https://github.com/kokkos/kokkos/pull/4147)
+- Implement subgroup reduction for SYCL RangePolicy parallel_reduce [\#3940](https://github.com/kokkos/kokkos/pull/3940)
+- Use DPC++ broadcast extension in SYCL team_broadcast [\#4103](https://github.com/kokkos/kokkos/pull/4103)
+- Only fence in SYCL parallel_reduce for non-device-accessible result_ptr [\#4089](https://github.com/kokkos/kokkos/pull/4089)
+- Improve fencing behavior in SYCL backend [\#4088](https://github.com/kokkos/kokkos/pull/4088)
+- Fence all registered SYCL queues before deallocating memory [\#4086](https://github.com/kokkos/kokkos/pull/4086)
+- Implement SYCL::print_configuration [\#3992](https://github.com/kokkos/kokkos/pull/3992)
+- Reuse scratch memory in parallel_scan and TeamPolicy (decreases memory footprint) [\#3899](https://github.com/kokkos/kokkos/pull/3899) [\#3889](https://github.com/kokkos/kokkos/pull/3889)
+
+#### CUDA:
+- Cuda improve heuristic for blocksize [\#4271](https://github.com/kokkos/kokkos/pull/4271)
+- Don't use [[deprecated]] for nvcc [\#4229](https://github.com/kokkos/kokkos/pull/4229)
+- Improve error message for NVHPC as host compiler [\#4227](https://github.com/kokkos/kokkos/pull/4227)
+- Update support for cuda reductions to work with types < 4bytes [\#4156](https://github.com/kokkos/kokkos/pull/4156)
+- Fix incompatible team size deduction in rare cases of parallel_reduce [\#4142](https://github.com/kokkos/kokkos/pull/4142)
+- Remove UVM usage in DynamicView [\#4129](https://github.com/kokkos/kokkos/pull/4129)
+- Remove dependency between core and containers [\#4114](https://github.com/kokkos/kokkos/pull/4114)
+- Adding opt-in CudaMallocSync support when using CUDA version >= 11.2 [\#4026](https://github.com/kokkos/kokkos/pull/4026) [\#4233](https://github.com/kokkos/kokkos/pull/4233)
+- Fix a potential race condition in the CUDA backend [\#3999](https://github.com/kokkos/kokkos/pull/3999)
+
+#### HIP:
+- Implement new blocksize deduction method for HIP Backend [\#3953](https://github.com/kokkos/kokkos/pull/3953)
+- Add multiple LaunchMechanism [\#3820](https://github.com/kokkos/kokkos/pull/3820)
+- Make HIP backend thread-safe [\#4170](https://github.com/kokkos/kokkos/pull/4170)
+
+#### Serial:
+- Refactor Serial backend and fix thread-safety issue [\#4053](https://github.com/kokkos/kokkos/pull/4053)
+
+#### OpenMPTarget:
+- OpenMPTarget: support array reductions in RangePolicy [\#4040](https://github.com/kokkos/kokkos/pull/4040)
+- OpenMPTarget: add MDRange parallel_reduce [\#4032](https://github.com/kokkos/kokkos/pull/4032)
+- OpenMPTarget: Fix bug in the case of a reducer. [\#4044](https://github.com/kokkos/kokkos/pull/4044)
+- OpenMPTarget: verify process fix [\#4041](https://github.com/kokkos/kokkos/pull/4041)
+
+### Build System Enhancements:
+
+#### Important BuildSystem Updates:
+- Use hipcc architecture autodetection when Kokkos_ARCH is not set [\#3941](https://github.com/kokkos/kokkos/pull/3941)
+- Introduce Kokkos_ENABLE_DEPRECATION_WARNINGS and remove deprecated code with Kokkos_ENABLE_DEPRECATED_CODE_3 [\#4106](https://github.com/kokkos/kokkos/pull/4106) [\#3855](https://github.com/kokkos/kokkos/pull/3855)
+
+#### Other Improvements:
+- Add allow-unsupported-compiler flag to nvcc-wrapper [\#4298](https://github.com/kokkos/kokkos/pull/4298)
+- nvcc_wrapper: fix errors in argument handling [\#3993](https://github.com/kokkos/kokkos/pull/3993)
+- Adds support for -time=<file> and -time <file> in nvcc_wrapper [\#4015](https://github.com/kokkos/kokkos/pull/4015)
+- nvcc_wrapper: suppress duplicates of GPU architecture and RDC flags [\#3968](https://github.com/kokkos/kokkos/pull/3968)
+- Fix TMPDIR support in nvcc_wrapper [\#3792](https://github.com/kokkos/kokkos/pull/3792)
+- NVHPC: update PGI compiler arch flags [\#4133](https://github.com/kokkos/kokkos/pull/4133)
+- Replace PGI with NVHPC (works for both) [\#4196](https://github.com/kokkos/kokkos/pull/4196)
+- Make sure that KOKKOS_CXX_HOST_COMPILER_ID is defined [\#4235](https://github.com/kokkos/kokkos/pull/4235)
+- Add options to Makefile builds for deprecated code and warnings [\#4215](https://github.com/kokkos/kokkos/pull/4215)
+- Use KOKKOS_CXX_HOST_COMPILER_ID for identifying CPU arch flags [\#4199](https://github.com/kokkos/kokkos/pull/4199)
+- Added support for Cray Clang to Makefile.kokkos [\#4176](https://github.com/kokkos/kokkos/pull/4176)
+- Add XLClang as compiler [\#4120](https://github.com/kokkos/kokkos/pull/4120)
+- Keep quoted compiler flags when passing to Trilinos [\#3987](https://github.com/kokkos/kokkos/pull/3987)
+- Add support for AMD Zen3 CPU architecture [\#3972](https://github.com/kokkos/kokkos/pull/3972)
+- Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945)
+- Add cppcoreguidelines-pro-type-cstyle-cast to clang-tidy [\#3522](https://github.com/kokkos/kokkos/pull/3522)
+- Add sve bit size definition for A64FX [\#3947](https://github.com/kokkos/kokkos/pull/3947) [\#3946](https://github.com/kokkos/kokkos/pull/3946)
+- Remove KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES [\#4150](https://github.com/kokkos/kokkos/pull/4150)
+
+### Other Changes:
+
+#### Tool Enhancements:
+
+- Retrieve original value from a point in a MultidimensionalSparseTuningProblem [\#3977](https://github.com/kokkos/kokkos/pull/3977)
+- Allow extension of built-in tuners with additional tuning axes [\#3961](https://github.com/kokkos/kokkos/pull/3961)
+- Added a categorical tuner [\#3955](https://github.com/kokkos/kokkos/pull/3955)
+
+
+#### Miscellaneous:
+
+- hpcbind: Use double quotes around $@ when invoking user command [\#4284](https://github.com/kokkos/kokkos/pull/4284)
+- Add file and line to error message [\#3985](https://github.com/kokkos/kokkos/pull/3985)
+- Fix compiler warnings when compiling with nvc++ [\#4198](https://github.com/kokkos/kokkos/pull/4198)
+- Add OpenMPTarget CI build on AMD GPUs [\#4055](https://github.com/kokkos/kokkos/pull/4055)
+- CI: icpx is now part of intel container [\#4002](https://github.com/kokkos/kokkos/pull/4002)
+
+### Incompatibilities:
+
+- Remove pre CUDA 9 KOKKOS_IMPL_CUDA_* macros [\#4138](https://github.com/kokkos/kokkos/pull/4138)
+
+### Bug Fixes:
+- UnorderedMap::clear() should zero the size() [\#4130](https://github.com/kokkos/kokkos/pull/4130)
+- Add memory fence for HostSharedPtr::cleanup() [\#4144](https://github.com/kokkos/kokkos/pull/4144)
+- SYCL: Fix race conditions in TeamPolicy::parallel_reduce [\#4418](https://github.com/kokkos/kokkos/pull/4418)
+- Adding missing memory fence to serial exec space fence. [\#4292](https://github.com/kokkos/kokkos/pull/4292)
+- Fix using external SYCL queues in tests [\#4291](https://github.com/kokkos/kokkos/pull/4291)
+- Fix digits10 bug [\#4281](https://github.com/kokkos/kokkos/pull/4281)
+- Fixes constexpr errors with frounding-math on gcc < 10. [\#4278](https://github.com/kokkos/kokkos/pull/4278)
+- Fix compiler flags for PGI/NVHPC [\#4264](https://github.com/kokkos/kokkos/pull/4264)
+- Fix Zen2/3 also implying Zen Arch with Makefiles [\#4260](https://github.com/kokkos/kokkos/pull/4260)
+- Kokkos_Cuda.hpp: Fix shadow warning with cuda/11.0 [\#4252](https://github.com/kokkos/kokkos/pull/4252)
+- Fix issue w/ static initialization of function attributes [\#4242](https://github.com/kokkos/kokkos/pull/4242)
+- Disable long double hypot test on Power systems [\#4221](https://github.com/kokkos/kokkos/pull/4221)
+- Fix false sharing in random pool [\#4218](https://github.com/kokkos/kokkos/pull/4218)
+- Fix a missing memory_fence for debug shared alloc code [\#4216](https://github.com/kokkos/kokkos/pull/4216)
+- Fix two xl issues [\#4179](https://github.com/kokkos/kokkos/pull/4179)
+- Makefile.kokkos: fix (standard_in) 1: syntax error [\#4173](https://github.com/kokkos/kokkos/pull/4173)
+- Fixes for query_device example [\#4172](https://github.com/kokkos/kokkos/pull/4172)
+- Fix a bug when using HIP atomic with Kokkos::Complex [\#4159](https://github.com/kokkos/kokkos/pull/4159)
+- Fix mistaken logic in pthread creation [\#4157](https://github.com/kokkos/kokkos/pull/4157)
+- Define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION when requesting Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON [\#4107](https://github.com/kokkos/kokkos/pull/4107)
+- Fix compilation with latest MSVC version [\#4102](https://github.com/kokkos/kokkos/pull/4102)
+- Fix incorrect macro definitions when compiling with Intel compiler on Windows [\#4087](https://github.com/kokkos/kokkos/pull/4087)
+- Fixup global buffer overflow in hand rolled string manipulation [\#4070](https://github.com/kokkos/kokkos/pull/4070)
+- Fixup heap buffer overflow in cmd line args parsing unit tests [\#4069](https://github.com/kokkos/kokkos/pull/4069)
+- Only add quotes in compiler flags for Trilinos if necessary [\#4067](https://github.com/kokkos/kokkos/pull/4067)
+- Fixed invocation of tools init callbacks [\#4061](https://github.com/kokkos/kokkos/pull/4061)
+- Work around SYCL JIT compiler issues with static variables [\#4013](https://github.com/kokkos/kokkos/pull/4013)
+- Fix TestDetectionIdiom.cpp test inclusion for Trilinos/TriBITS [\#4010](https://github.com/kokkos/kokkos/pull/4010)
+- Fixup allocation headers with OpenMPTarget backend [\#4003](https://github.com/kokkos/kokkos/pull/4003)
+- Add missing specialization for OMPT to Kokkos Random [\#3967](https://github.com/kokkos/kokkos/pull/3967)
+- Disable hypot long double test on power arches [\#3962](https://github.com/kokkos/kokkos/pull/3962)
+- Use different EBO workaround for MSVC (rebased) [\#3924](https://github.com/kokkos/kokkos/pull/3924)
+- Fix SYCL Kokkos::Profiling::(de)allocateData calls [\#3928](https://github.com/kokkos/kokkos/pull/3928)
+
 ## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01)
 
diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt
index 9452027d8ee99592293cba43b7a60a7d2c0c3bbc..1b6753f983db34e64bc1e10bfc3f008c6fec5ede 100644
--- a/packages/kokkos/CMakeLists.txt
+++ b/packages/kokkos/CMakeLists.txt
@@ -111,8 +111,8 @@ ENDIF()
 
 
 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 4)
-set(Kokkos_VERSION_PATCH 01)
+set(Kokkos_VERSION_MINOR 5)
+set(Kokkos_VERSION_PATCH 00)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
@@ -210,7 +210,12 @@ IF (KOKKOS_HAS_TRILINOS)
   # which needs another workaround.
   SET(KOKKOS_COMPILE_OPTIONS_TMP)
   FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
-    LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\")
+    STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
+    IF(OPTION_HAS_WHITESPACE EQUAL -1)
+      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}")
+    ELSE()
+      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"")
+    ENDIF()
   ENDFOREACH()
   STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}")
   LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS})
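Editor's note: the loop above now quotes only options that actually contain whitespace before flattening them into `KOKKOSCORE_COMPILE_OPTIONS`. A quick shell illustration (not project code) of why unconditional quoting is harmful — a single-word flag wrapped in literal quotes reaches the compiler verbatim and is rejected:

    echo 'int main(){}' > /tmp/probe.cpp
    g++ '"-O2"' /tmp/probe.cpp -o /tmp/probe   # error: unrecognized command-line option '"-O2"'
    g++ -O2 /tmp/probe.cpp -o /tmp/probe       # accepted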
diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos
index bda8572073a326320dab54084080bd57115ce791..7ab18f5894e8880bd0584a4815f2260e07772cfb 100644
--- a/packages/kokkos/Makefile.kokkos
+++ b/packages/kokkos/Makefile.kokkos
@@ -1,19 +1,19 @@
 # Default settings common options.
 
 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 4
-KOKKOS_VERSION_PATCH = 01
+KOKKOS_VERSION_MINOR = 5
+KOKKOS_VERSION_PATCH = 00
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
-# Options: Cuda,HIP,OpenMP,Pthread,Serial
+# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Pthread,Serial
 #KOKKOS_DEVICES ?= "OpenMP"
 KOKKOS_DEVICES ?= "Pthread"
-# Options: 
+# Options:
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKX
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM:      BGQ,Power7,Power8,Power9
-# AMD-GPUS: Vega900,Vega906,Vega908
+# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
 KOKKOS_ARCH ?= ""
 # Options: yes,no
@@ -22,7 +22,7 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++14"
-# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align
+# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings
 KOKKOS_OPTIONS ?= ""
 KOKKOS_CMAKE ?= "no"
 KOKKOS_TRIBITS ?= "no"
@@ -70,7 +70,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper
 
 # Check for advanced settings.
 KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
-KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
+KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
 KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
 KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
@@ -82,6 +82,9 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
 KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
 KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
+KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics)
+KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings)
 
 KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)
 
@@ -102,6 +105,7 @@ endif
 # Check for other Execution Spaces.
 KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda)
 KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP)
+KOKKOS_INTERNAL_USE_SYCL := $(call kokkos_has_string,$(KOKKOS_DEVICES),SYCL)
 KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget)
 
 KOKKOS_DEVICELIST =
@@ -123,11 +127,18 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   KOKKOS_DEVICELIST += HIP
 endif
+KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
+                                                  + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
+                                                  + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  KOKKOS_DEVICELIST += SYCL
+  ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
+    $(error SYCL backend requires C++17 or newer)
+  endif
+
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   KOKKOS_DEVICELIST += OPENMPTARGET
-  KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
-                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
-                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
   ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
     $(error OpenMPTarget backend requires C++17 or newer)
   endif
@@ -158,6 +169,8 @@ KOKKOS_INTERNAL_COMPILER_XL          := $(strip $(shell $(CXX) -qversion       2
 KOKKOS_INTERNAL_COMPILER_CRAY        := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-"))
 KOKKOS_INTERNAL_COMPILER_NVCC        := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc))
 KOKKOS_INTERNAL_COMPILER_CLANG       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
+KOKKOS_INTERNAL_COMPILER_CRAY_CLANG  := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++"))
+KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI)
 KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
 KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 KOKKOS_INTERNAL_COMPILER_GCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)
@@ -237,7 +250,11 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
   KOKKOS_INTERNAL_OPENMP_FLAG := -mp
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
+    KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+    else
     KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+    endif
   else
     ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
       KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
@@ -249,7 +266,11 @@ else
           # OpenMP is turned on by default in Cray compiler environment.
           KOKKOS_INTERNAL_OPENMP_FLAG :=
         else
-          KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+          ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
+            KOKKOS_INTERNAL_OPENMP_FLAG := -fiopenmp
+          else
+            KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+          endif
         endif
       endif
     endif
@@ -307,6 +328,13 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)
 
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP)
+
 # NVIDIA based.
 NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper
 KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30)
@@ -374,20 +402,25 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
 KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3)
 KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
-KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0)
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 0)
+    KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+  endif
+endif
 KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
 KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)
 KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908)
+KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A)
 
 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
-KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
 KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
@@ -396,8 +429,8 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW
 KOKKOS_INTERNAL_USE_TM            := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
-KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc))
+KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
   $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
@@ -432,6 +465,10 @@ KOKKOS_LINK_FLAGS =
 KOKKOS_SRC =
 KOKKOS_HEADERS =
 
+#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+  KOKKOS_LIBS += -latomic
+#endif
+
 # Generating the KokkosCore_config.h file.
 
 KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@@ -468,6 +505,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP')
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_SYCL')
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
   tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
   ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
@@ -523,6 +564,12 @@ endif
 
 #only add the c++ standard flags if this is not CMake
 tmp := $(call kokkos_append_header,"/* General Settings */")
+ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3")
+endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATION_WARNINGS")
+endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
 ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
   KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
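For orientation, this is roughly the fragment the new deprecation switches add to the generated KokkosCore_config.h (a sketch based on the logic above; the surrounding content depends on the rest of the configuration):

````cpp
/* General Settings */
/* Emitted unless KOKKOS_OPTIONS contains disable_deprecated_code: */
#define KOKKOS_ENABLE_DEPRECATED_CODE_3
/* Emitted only when KOKKOS_OPTIONS contains enable_deprecation_warnings: */
#define KOKKOS_ENABLE_DEPRECATION_WARNINGS
````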
@@ -625,8 +672,10 @@ endif
 
 tmp := $(call kokkos_append_header,"/* Optimization Settings */")
 
-ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
+ifeq ($(KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION), 1)
+  # KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION is deprecated; keep emitting it for backward compatibility.
   tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION")
 endif
 
 tmp := $(call kokkos_append_header,"/* Cuda Settings */")
@@ -1156,6 +1205,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
     tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
     KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908
   endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 90A")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a
+  endif
 
 
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
@@ -1174,6 +1228,52 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
   endif
 endif
 
+# Figure out the architecture flag for SYCL.
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+  # Let's start by adding architecture defines
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
+    KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
+  endif
+
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp)
+
+  KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+  KOKKOS_LDFLAGS+=-fsycl
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
+endif
 
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
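As a concrete illustration of the SYCL branch above, a build with `KOKKOS_DEVICES=SYCL` and `KOKKOS_ARCH=IntelDG1` ends up with roughly the following in KokkosCore_config.h, plus the listed SYCL flags on the compile and link lines (a sketch of the generated output, not verbatim):

````cpp
/* Backend and architecture selected by the Makefile logic above */
#define KOKKOS_ENABLE_SYCL
#define KOKKOS_ARCH_INTEL_GPU
#define KOKKOS_ARCH_INTEL_DG1
/* Flags added to KOKKOS_CXXFLAGS and KOKKOS_LDFLAGS:
   -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda
   -fsycl-targets=spir64_gen-unknown-unknown-sycldevice
   -Xsycl-target-backend "-device dg1" */
````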
 
@@ -1185,57 +1285,63 @@ endif
 
 ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
   tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
-endif
 
-# Functions for generating config header file
-kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
-kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
-kokkos_append_config_header = $(shell echo $1 >> $2))
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
-tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
-   ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-   else
-   endif
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-   tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
-   tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  # Functions for generating config header file
+  kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
+  kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
+  kokkos_append_config_header = $(shell echo $1 >> $2))
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
+  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
+  tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
+  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
+    ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
+    else
+    endif
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SYCL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SYCL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_SYCL.hpp>","KokkosCore_Config_SetupBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
+    tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
+    tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+  endif
 endif
+
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
@@ -1247,6 +1353,9 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+  ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+    KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+  endif
   KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
   ifneq ($(CUDA_PATH),)
     KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include
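The desul lock array is an implementation detail of the optional `enable_desul_atomics` path; user code keeps calling the same Kokkos atomic API either way. A minimal, hypothetical sketch (not part of the patch):

````cpp
#include <Kokkos_Core.hpp>

// Counts even entries. Kokkos::atomic_increment is backed by the desul
// implementation only when enable_desul_atomics was requested above;
// the call site does not change.
void count_even(Kokkos::View<const int*> v, Kokkos::View<int> n_even) {
  Kokkos::parallel_for(
      "count_even", v.extent(0), KOKKOS_LAMBDA(const int i) {
        if (v(i) % 2 == 0) Kokkos::atomic_increment(&n_even());
      });
}
````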
diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets
index cf9fc242420e1dbbb519b3312cf1a4c3b4354738..93854d0cf150c97d5058422b7ca9ff28ce2ba8b6 100644
--- a/packages/kokkos/Makefile.targets
+++ b/packages/kokkos/Makefile.targets
@@ -48,6 +48,17 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+Kokkos_SYCL.o : $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+Kokkos_SYCL_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+Kokkos_SYCL_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md
index d55ef2caac93ae6803aa97925ea7b081d3f05ca3..673f4627125223f0d09f1e184d9942c0e6a0b7ff 100644
--- a/packages/kokkos/README.md
+++ b/packages/kokkos/README.md
@@ -7,7 +7,7 @@ applications targeting all major HPC platforms. For that purpose it provides
 abstractions for both parallel execution of code and data management.
 Kokkos is designed to target complex node architectures with N-level memory
 hierarchies and multiple types of execution resources. It currently can use
-CUDA, HPX, OpenMP and Pthreads as backend programming models with several other
+CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other
 backends in development.
 
 Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,
@@ -16,29 +16,19 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools).
 
 # Learning about Kokkos
 
-A programming guide can be found on the Wiki, the API reference is under development.
+The best way to start learning about Kokkos is to go through the Kokkos Lectures.
+They are available online at https://kokkos.link/the-lectures and contain a mix
+of lecture videos and hands-on exercises covering all the important Kokkos
+Ecosystem capabilities.
+
+A programming guide and API reference can be found on the Wiki
+(https://github.com/kokkos/kokkos/wiki).
 
 For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
 
 For non-public questions send an email to
 crtrott(at)sandia.gov
 
-A separate repository with extensive tutorial material can be found under
-https://github.com/kokkos/kokkos-tutorials.
-
-Furthermore, the 'example/tutorial' directory provides step by step tutorial
-examples which explain many of the features of Kokkos. They work with
-simple Makefiles. To build with g++ and OpenMP simply type 'make'
-in the 'example/tutorial' directory. This will build all examples in the
-subfolders. To change the build options refer to the Programming Guide
-in the compilation section.
-
-To learn more about Kokkos consider watching one of our presentations:
-* GTC 2015:
-  - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html
-  - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf
-
-
 # Contributing to Kokkos
 
 We are open and try to encourage contributions from external developers.
@@ -53,57 +43,40 @@ For specifics see the LICENSE file contained in the repository or distribution.
 
 # Requirements
 
-### Primary tested compilers on X86 are:
-* GCC 5.3.0
-* GCC 5.4.0
-* GCC 5.5.0
-* GCC 6.1.0
-* GCC 7.2.0
-* GCC 7.3.0
-* GCC 8.1.0
-* Intel 17.0.1
-* Intel 17.4.196
-* Intel 18.2.128
-* Clang 4.0.0
-* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0)
-* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1)
-* Clang 8.0.0 for CUDA (CUDA Toolkit 9.2)
-* PGI 18.7
-* NVCC 9.1 for CUDA (with gcc 6.1.0)
-* NVCC 9.2 for CUDA (with gcc 7.2.0)
-* NVCC 10.0 for CUDA (with gcc 7.4.0)
-* NVCC 10.1 for CUDA (with gcc 7.4.0)
-* NVCC 11.0 for CUDA (with gcc 8.4.0)
-
-### Primary tested compilers on Power 8 are:
-* GCC 6.4.0 (OpenMP,Serial)
-* GCC 7.2.0 (OpenMP,Serial)
-* IBM XL 16.1.0 (OpenMP, Serial)
-* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0)
-
-### Primary tested compilers on Intel KNL are:
-* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0)
-* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0)
-
-### Primary tested compilers on ARM (Cavium ThunderX2)
-* GCC 7.2.0
-* ARM/Clang 18.4.0
-
-### Other compilers working:
-* X86:
-    * Cygwin 2.1.0 64bit with gcc 4.9.3
-    * GCC 8.1.0 (not warning free)
-
-### Known non-working combinations:
-* Power8:
-    * Pthreads backend
-* ARM
-    * Pthreads backend
+### Minimum Compiler Versions
+
+Generally, Kokkos should work with all compiler versions newer than the minimum.
+However, as in any sufficiently complex code base, we have to work around compiler
+bugs with almost all compilers, so compiler versions we do not test may have issues
+we are unaware of.
+
+* GCC: 5.3.0
+* Clang: 4.0.0
+* Intel: 17.0.1
+* NVCC: 9.2.88
+* NVC++: 21.5
+* ROCM: 4.3
+* MSVC: 19.29
+* IBM XL: 16.1.1
+* Fujitsu: 4.5.0
+* ARM/Clang: 20.1
+
+### Primary Tested Compilers
+
+* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0
+* NVCC: 9.2.88, 10.1, 11.0
+* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0
+* Intel: 17.4, 18.1, 19.5
+* MSVC: 19.29
+* ARM/Clang: 20.1
+* IBM XL: 16.1.1
+* ROCM: 4.3.0
 
 ### Build system:
-* CMake >= 3.10: required
-* CMake >= 3.13: recommended
+
+* CMake >= 3.16: required
 * CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues).
+* CMake >= 3.21.1 for NVC++
 
 Primary tested compiler are passing in release mode
 with warnings as errors. They also are tested with a comprehensive set of
@@ -153,7 +126,6 @@ cmake $srcdir \
   -DCMAKE_INSTALL_PREFIX=$path_to_install \
   -DKokkos_ENABLE_OPENMP=On \
   -DKokkos_ARCH_HSW=On \
-  -DKokkos_ENABLE_HWLOC=On \
   -DKokkos_HWLOC_DIR=$path_to_hwloc
 ````
 then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages.
@@ -212,23 +184,8 @@ where `...` is the unique spec identifying the particular Kokkos configuration a
 Some more details can found in the Kokkos spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest).
 
 ## Raw Makefile
-A bash script is provided to generate raw makefiles.
-To install Kokkos as a library create a build directory and run the following
-````bash
-> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
-````
-Once the Makefile is generated, run:
-````bash
-> make kokkoslib
-> make install
-````
-To additionally run the unit tests:
-````bash
-> make build-test
-> make test
-````
-Run `generate_makefile.bash --help` for more detailed options such as
-changing the device type for which to build.
+
+Raw Makefiles are only supported via inline builds. See below.
 
 ## Inline Builds vs. Installed Package
 For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package.
@@ -268,6 +225,35 @@ more than a single GPU is used by a single process.
 
 If you publish work which mentions Kokkos, please cite the following paper:
 
+````BibTeX
+@ARTICLE{9485033,
+  author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Kokkos 3: Programming Model Extensions for the Exascale Era},
+  year={2022},
+  volume={33},
+  number={4},
+  pages={805-817},
+  doi={10.1109/TPDS.2021.3097283}}
+````
+
+If you use more than one Kokkos EcoSystem package, please also cite:
+
+````BibTeX
+@ARTICLE{9502936,
+  author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff},
+  journal={Computing in Science \& Engineering},
+  title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing},
+  year={2021},
+  volume={23},
+  number={5},
+  pages={10-18},
+  doi={10.1109/MCSE.2021.3098509}}
+````
+
+
+And if you feel generous, also cite the original Kokkos paper, which describes most of the basic Kokkos concepts:
+
 ````BibTeX
 @article{CarterEdwards20143202,
   title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
diff --git a/packages/kokkos/algorithms/CMakeLists.txt b/packages/kokkos/algorithms/CMakeLists.txt
index 4df76a1dbbd17eb269694b8bd801184ccc02e047..eb54db8a556453f4afed4c95cf20321e8cbe211e 100644
--- a/packages/kokkos/algorithms/CMakeLists.txt
+++ b/packages/kokkos/algorithms/CMakeLists.txt
@@ -5,9 +5,7 @@ KOKKOS_SUBPACKAGE(Algorithms)
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
-IF(NOT (KOKKOS_ENABLE_OPENMPTARGET
-        AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR
-             KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)))
+IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
   KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
 ENDIF()
 
diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
index 55ce19971faf576483da1ec66cedea0735bc8c7a..46b8ab87fabfbeabda12beb3ddabf0eb6aab3482 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -447,6 +447,25 @@ struct rand<Generator, unsigned long long> {
   }
 };
 
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <class Generator>
+struct rand<Generator, Kokkos::Experimental::half_t> {
+  using half = Kokkos::Experimental::half_t;
+  KOKKOS_INLINE_FUNCTION
+  static half max() { return half(1.0); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen) { return half(gen.frand()); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& range) {
+    return half(gen.frand(float(range)));
+  }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& start, const half& end) {
+    return half(gen.frand(float(start), float(end)));
+  }
+};
+#endif  // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+
 template <class Generator>
 struct rand<Generator, float> {
   KOKKOS_INLINE_FUNCTION
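A minimal usage sketch for the new `half_t` specialization (assuming a backend where `half_t` is a genuine 16-bit type, i.e. `KOKKOS_HALF_T_IS_FLOAT` is false):

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using pool_type = Kokkos::Random_XorShift64_Pool<>;
    using half      = Kokkos::Experimental::half_t;
    pool_type pool(/*seed=*/12345);
    Kokkos::View<half*> v("v", 1000);
    Kokkos::parallel_for(
        "fill_half", v.extent(0), KOKKOS_LAMBDA(const int i) {
          auto gen = pool.get_state();
          // Routed through rand<Generator, half_t>::draw, which draws a
          // floating-point value in [0, 1) and converts it to half_t.
          v(i) = Kokkos::rand<pool_type::generator_type, half>::draw(gen);
          pool.free_state(gen);
        });
  }
  Kokkos::finalize();
}
````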
@@ -600,7 +619,7 @@ struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
 
 template <class ExecutionSpace>
 struct Random_UniqueIndex {
-  using locks_view_type = View<int*, ExecutionSpace>;
+  using locks_view_type = View<int**, ExecutionSpace>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
@@ -615,7 +634,7 @@ struct Random_UniqueIndex {
 #ifdef KOKKOS_ENABLE_CUDA
 template <>
 struct Random_UniqueIndex<Kokkos::Cuda> {
-  using locks_view_type = View<int*, Kokkos::Cuda>;
+  using locks_view_type = View<int**, Kokkos::Cuda>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
 #ifdef __CUDA_ARCH__
@@ -625,7 +644,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
                  blockDim.x * blockDim.y * blockDim.z +
              i_offset) %
             locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i += blockDim.x * blockDim.y * blockDim.z;
       if (i >= static_cast<int>(locks_.extent(0))) {
         i = i_offset;
@@ -643,7 +662,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
 #ifdef KOKKOS_ENABLE_HIP
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
-  using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
+  using locks_view_type = View<int**, Kokkos::Experimental::HIP>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
 #ifdef __HIP_DEVICE_COMPILE__
@@ -653,7 +672,7 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
                  blockDim.x * blockDim.y * blockDim.z +
              i_offset) %
             locks_.extent(0);
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i += blockDim.x * blockDim.y * blockDim.z;
       if (i >= static_cast<int>(locks_.extent(0))) {
         i = i_offset;
@@ -671,15 +690,15 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
 #ifdef KOKKOS_ENABLE_SYCL
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
-  using locks_view_type = View<int*, Kokkos::Experimental::SYCL>;
+  using locks_view_type = View<int**, Kokkos::Experimental::SYCL>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks_) {
-#ifdef KOKKOS_ARCH_INTEL_GEN
+#ifdef KOKKOS_ARCH_INTEL_GPU
     int i = Kokkos::Impl::clock_tic() % locks_.extent(0);
 #else
     int i = 0;
 #endif
-    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
       i = (i + 1) % static_cast<int>(locks_.extent(0));
     }
     return i;
@@ -690,14 +709,14 @@ struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
 template <>
 struct Random_UniqueIndex<Kokkos::Experimental::OpenMPTarget> {
-  using locks_view_type = View<int*, Kokkos::Experimental::OpenMPTarget>;
+  using locks_view_type = View<int**, Kokkos::Experimental::OpenMPTarget>;
   KOKKOS_FUNCTION
   static int get_state_idx(const locks_view_type& locks) {
     const int team_size = omp_get_num_threads();
     int i               = omp_get_team_num() * team_size + omp_get_thread_num();
     const int lock_size = locks.extent_int(0);
 
-    while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) {
+    while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
       i = (i + 1) % lock_size;
     }
     return i;
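The pattern shared by all these specializations: a thread proposes a starting index, then claims the first free generator slot with a compare-and-swap. A standalone sketch of the idea (function name hypothetical):

````cpp
#include <Kokkos_Core.hpp>

// Returns the index of a lock slot this thread now owns.
// atomic_compare_exchange returns the previous value, so a result of 0 means
// the lock was free and is now taken; anything else means probe the next slot.
KOKKOS_FUNCTION int claim_slot(const Kokkos::View<int**>& locks, int start) {
  const int n = static_cast<int>(locks.extent(0));
  int i       = start % n;
  while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
    i = (i + 1) % n;
  }
  return i;
}
````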
@@ -856,18 +875,22 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift64_Pool {
  private:
   using execution_space = typename DeviceType::execution_space;
-  using locks_type      = View<int*, execution_space>;
-  using state_data_type = View<uint64_t*, DeviceType>;
+  using locks_type      = View<int**, execution_space>;
+  using state_data_type = View<uint64_t**, DeviceType>;
   locks_type locks_;
   state_data_type state_;
   int num_states_;
+  int padding_;
 
  public:
   using generator_type = Random_XorShift64<DeviceType>;
   using device_type    = DeviceType;
 
   KOKKOS_INLINE_FUNCTION
-  Random_XorShift64_Pool() { num_states_ = 0; }
+  Random_XorShift64_Pool() {
+    num_states_ = 0;
+    padding_    = 0;
+  }
   Random_XorShift64_Pool(uint64_t seed) {
     num_states_ = 0;
 
@@ -883,16 +906,22 @@ class Random_XorShift64_Pool {
     locks_      = src.locks_;
     state_      = src.state_;
     num_states_ = src.num_states_;
+    padding_    = src.padding_;
     return *this;
   }
 
   void init(uint64_t seed, int num_states) {
     if (seed == 0) seed = uint64_t(1318319);
-
+    // Only pad on CPU-like architectures (fewer than 1000 threads). The
+    // padding of 64 is a magic number: just something that is neither too
+    // large nor too small.
+    padding_    = num_states < 1000 ? 64 : 1;
     num_states_ = num_states;
 
-    locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
-    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);
+    locks_ =
+        locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_);
+    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_,
+                             padding_);
 
     typename state_data_type::HostMirror h_state = create_mirror_view(state_);
     typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);
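Why the second extent matters on the host: with padding, consecutive lock entries are far enough apart that two CPU threads spinning on neighboring locks do not share a cache line. A small host-only illustration (assuming LayoutRight and 4-byte int):

````cpp
#include <Kokkos_Core.hpp>
#include <cstddef>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int num_states = 8, padding = 64;
    Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::HostSpace> locks(
        "locks", num_states, padding);
    // Only element (i, 0) is ever used; the other 63 ints per row act as
    // spacing, so adjacent locks are 256 bytes apart instead of 4.
    const std::ptrdiff_t gap = reinterpret_cast<const char*>(&locks(1, 0)) -
                               reinterpret_cast<const char*>(&locks(0, 0));
    std::printf("bytes between adjacent locks: %td\n", gap);
  }
  Kokkos::finalize();
}
````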
@@ -902,15 +931,15 @@ class Random_XorShift64_Pool {
         gen(seed, 0);
     for (int i = 0; i < 17; i++) gen.rand();
     for (int i = 0; i < num_states_; i++) {
-      int n1     = gen.rand();
-      int n2     = gen.rand();
-      int n3     = gen.rand();
-      int n4     = gen.rand();
-      h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
-                   (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
-                   (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
-                   (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
-      h_lock(i) = 0;
+      int n1        = gen.rand();
+      int n2        = gen.rand();
+      int n3        = gen.rand();
+      int n4        = gen.rand();
+      h_state(i, 0) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
+                      (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
+                      (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
+                      (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
+      h_lock(i, 0) = 0;
     }
     deep_copy(state_, h_state);
     deep_copy(locks_, h_lock);
@@ -920,19 +949,19 @@ class Random_XorShift64_Pool {
   Random_XorShift64<DeviceType> get_state() const {
     const int i =
         Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
-    return Random_XorShift64<DeviceType>(state_(i), i);
+    return Random_XorShift64<DeviceType>(state_(i, 0), i);
   }
 
   // NOTE: state_idx MUST be unique and less than num_states
   KOKKOS_INLINE_FUNCTION
   Random_XorShift64<DeviceType> get_state(const int state_idx) const {
-    return Random_XorShift64<DeviceType>(state_(state_idx), state_idx);
+    return Random_XorShift64<DeviceType>(state_(state_idx, 0), state_idx);
   }
 
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift64<DeviceType>& state) const {
-    state_(state.state_idx_) = state.state_;
-    locks_(state.state_idx_) = 0;
+    state_(state.state_idx_, 0) = state.state_;
+    locks_(state.state_idx_, 0) = 0;
   }
 };
 
@@ -1092,14 +1121,15 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift1024_Pool {
  private:
   using execution_space = typename DeviceType::execution_space;
-  using locks_type      = View<int*, execution_space>;
-  using int_view_type   = View<int*, DeviceType>;
+  using locks_type      = View<int**, execution_space>;
+  using int_view_type   = View<int**, DeviceType>;
   using state_data_type = View<uint64_t * [16], DeviceType>;
 
   locks_type locks_;
   state_data_type state_;
   int_view_type p_;
   int num_states_;
+  int padding_;
   friend class Random_XorShift1024<DeviceType>;
 
  public:
@@ -1129,15 +1159,21 @@ class Random_XorShift1024_Pool {
     state_      = src.state_;
     p_          = src.p_;
     num_states_ = src.num_states_;
+    padding_    = src.padding_;
     return *this;
   }
 
   inline void init(uint64_t seed, int num_states) {
     if (seed == 0) seed = uint64_t(1318319);
+    // Only pad on CPU-like architectures (fewer than 1000 threads). The
+    // padding of 64 is a magic number: just something that is neither too
+    // large nor too small.
+    padding_    = num_states < 1000 ? 64 : 1;
     num_states_ = num_states;
-    locks_      = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
+    locks_ =
+        locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_);
     state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
-    p_     = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);
+    p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_);
 
     typename state_data_type::HostMirror h_state = create_mirror_view(state_);
     typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);
@@ -1158,8 +1194,8 @@ class Random_XorShift1024_Pool {
                         (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
                         (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
       }
-      h_p(i)    = 0;
-      h_lock(i) = 0;
+      h_p(i, 0)    = 0;
+      h_lock(i, 0) = 0;
     }
     deep_copy(state_, h_state);
     deep_copy(locks_, h_lock);
@@ -1169,20 +1205,20 @@ class Random_XorShift1024_Pool {
   Random_XorShift1024<DeviceType> get_state() const {
     const int i =
         Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
-    return Random_XorShift1024<DeviceType>(state_, p_(i), i);
+    return Random_XorShift1024<DeviceType>(state_, p_(i, 0), i);
   };
 
   // NOTE: state_idx MUST be unique and less than num_states
   KOKKOS_INLINE_FUNCTION
   Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
-    return Random_XorShift1024<DeviceType>(state_, p_(state_idx), state_idx);
+    return Random_XorShift1024<DeviceType>(state_, p_(state_idx, 0), state_idx);
   }
 
   KOKKOS_INLINE_FUNCTION
   void free_state(const Random_XorShift1024<DeviceType>& state) const {
     for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
-    p_(state.state_idx_)     = state.p_;
-    locks_(state.state_idx_) = 0;
+    p_(state.state_idx_, 0)     = state.p_;
+    locks_(state.state_idx_, 0) = 0;
   }
 };
 
diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
index d17c02776ff5653045d9259d7a4fae2207546e21..7c1ce4c4cd8e757f3018da3989660d3b4c5e4cff 100644
--- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -319,7 +319,7 @@ class BinSort {
                    Kokkos::RangePolicy<execution_space>(0, len), functor);
     }
 
-    execution_space().fence();
+    execution_space().fence("Kokkos::Sort: fence after sorting");
   }
 
   template <class ValuesViewType>
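Labeled fences like the one above show up with their string in Kokkos Tools output, which makes synchronization cost easy to attribute. A small sketch of the same idiom in user code:

````cpp
#include <Kokkos_Core.hpp>

void scale(Kokkos::View<double*> x, double a) {
  Kokkos::parallel_for(
      "scale", x.extent(0), KOKKOS_LAMBDA(const int i) { x(i) *= a; });
  // Global fence with a label; profiling tools report the string instead of
  // an anonymous fence, just like the BinSort fence above.
  Kokkos::fence("example: fence after scale");
}
````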
@@ -492,7 +492,8 @@ bool try_std_sort(ViewType view) {
                       view.stride_3(), view.stride_4(), view.stride_5(),
                       view.stride_6(), view.stride_7()};
   possible         = possible &&
-             std::is_same<typename ViewType::memory_space, HostSpace>::value;
+             SpaceAccessibility<HostSpace,
+                                typename ViewType::memory_space>::accessible;
   possible = possible && (ViewType::Rank == 1);
   possible = possible && (stride[0] == 1);
   if (possible) {
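The new check accepts any memory space the host can reach (for example CUDA UVM), not only views whose memory space is exactly HostSpace. A compile-time sketch of the same trait:

````cpp
#include <Kokkos_Core.hpp>

// True if std::sort running on the host could legally touch the view's data.
template <class ViewType>
constexpr bool host_can_access() {
  return Kokkos::SpaceAccessibility<
      Kokkos::HostSpace, typename ViewType::memory_space>::accessible;
}

static_assert(host_can_access<Kokkos::View<int*, Kokkos::HostSpace>>(),
              "host views are always accessible from the host");
````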
diff --git a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
index c37e779c9927b21b2add67236124f3821341d968..3dffce7df4f8dd1663a10a871075c1841005138d 100644
--- a/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/packages/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -47,7 +47,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <cmath>
@@ -198,11 +198,50 @@ struct test_random_functor {
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp2 / theMax);
       const uint64_t ind3_3d =
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp3 / theMax);
-
+// Work around an Intel 17 compiler bug that sometimes adds random
+// instruction alignment, which makes the lock instruction illegal.
+// It seems to affect mostly unsigned int atomics. Looking at the
+// assembly, the compiler appears to insert cache-line alignment for
+// the instruction. The issue is not restricted to specific
+// architectures; it has been seen on SNB and SKX, but for different
+// code. Another occurrence was with desul atomics in a different
+// unit test, whereas this one here happens without desul atomics.
+// Inserting an assembly nop instruction changes the alignment and
+// works around the problem.
+//
+// 17.0.4 for 64bit Random works with 1/1/1/2/1
+// 17.0.4 for 1024bit Random works with 1/1/1/1/1
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind1_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind2_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_1d(ind3_1d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      if (std::is_same<rnd_type, Kokkos::Random_XorShift64<device_type>>::value)
+        asm volatile("nop\n");
+      asm volatile("nop\n");
+#endif
+#endif
       atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
     }
     rand_pool.free_state(rand_gen);
   }
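Since the same guarded `nop` appears before every atomic update, the workaround could also be wrapped once in a helper macro; a hypothetical sketch (not part of this patch):

````cpp
// Hypothetical helper: expands to the alignment-breaking nop only for the
// affected Intel 17 compilers and to nothing otherwise.
#if defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 1800)
#define TEST_RANDOM_INTEL17_ALIGNMENT_NOP() asm volatile("nop\n")
#else
#define TEST_RANDOM_INTEL17_ALIGNMENT_NOP() ((void)0)
#endif

// Usage inside the functor body:
//   TEST_RANDOM_INTEL17_ALIGNMENT_NOP();
//   atomic_fetch_add(&density_1d(ind1_1d), 1);
````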
@@ -338,9 +377,11 @@ struct test_random_scalar {
       using functor_type =
           test_histogram1d_functor<typename RandomGenerator::device_type>;
       parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result);
-
-      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
-      double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D;
+      double mean_eps_expect       = 0.0001;
+      double variance_eps_expect   = 0.07;
+      double covariance_eps_expect = 0.06;
+      double tolerance             = 6 * std::sqrt(1.0 / HIST_DIM1D);
+      double mean_expect           = 1.0 * num_draws * 3 / HIST_DIM1D;
       double variance_expect =
           1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
       double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D;
@@ -349,11 +390,26 @@ struct test_random_scalar {
           variance_expect / (result.variance / HIST_DIM1D) - 1.0;
       double covariance_eps =
           (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
-      pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0;
-      pass_hist1d_var =
-          ((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 1 : 0;
-      pass_hist1d_covar =
-          ((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 1 : 0;
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+      if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
+        mean_eps_expect       = 0.0003;
+        variance_eps_expect   = 1.0;
+        covariance_eps_expect = 5.0e4;
+      }
+#endif
+
+      pass_hist1d_mean =
+          ((-mean_eps_expect < mean_eps) && (mean_eps_expect > mean_eps)) ? 1
+                                                                          : 0;
+      pass_hist1d_var = ((-variance_eps_expect < variance_eps) &&
+                         (variance_eps_expect > variance_eps))
+                            ? 1
+                            : 0;
+      pass_hist1d_covar = ((-covariance_eps_expect < covariance_eps) &&
+                           (covariance_eps_expect > covariance_eps))
+                              ? 1
+                              : 0;
 
       cout << "Density 1D: " << mean_eps << " " << variance_eps << " "
            << (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || "
@@ -371,8 +427,9 @@ struct test_random_scalar {
           test_histogram3d_functor<typename RandomGenerator::device_type>;
       parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result);
 
-      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
-      double mean_expect = 1.0 * num_draws / HIST_DIM1D;
+      double variance_factor = 1.2;
+      double tolerance       = 6 * std::sqrt(1.0 / HIST_DIM1D);
+      double mean_expect     = 1.0 * num_draws / HIST_DIM1D;
       double variance_expect =
           1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
       double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D;
@@ -381,15 +438,23 @@ struct test_random_scalar {
           variance_expect / (result.variance / HIST_DIM1D) - 1.0;
       double covariance_eps =
           (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
+
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+      if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
+        variance_factor = 7;
+      }
+#endif
+
       pass_hist3d_mean =
           ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0;
-      pass_hist3d_var = ((-1.2 * tolerance < variance_eps) &&
-                         (1.2 * tolerance > variance_eps))
+      pass_hist3d_var = ((-variance_factor * tolerance < variance_eps) &&
+                         (variance_factor * tolerance > variance_eps))
                             ? 1
                             : 0;
-      pass_hist3d_covar =
-          ((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 1
-                                                                          : 0;
+      pass_hist3d_covar = ((-variance_factor * tolerance < covariance_eps) &&
+                           (variance_factor * tolerance > covariance_eps))
+                              ? 1
+                              : 0;
 
       cout << "Density 3D: " << mean_eps << " " << variance_eps << " "
            << result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance
@@ -471,6 +536,21 @@ void test_random(unsigned int num_draws) {
   deep_copy(density_1d, 0);
   deep_copy(density_3d, 0);
 
+  cout << "Test Scalar=half" << endl;
+  test_random_scalar<RandomGenerator, Kokkos::Experimental::half_t> test_half(
+      density_1d, density_3d, pool, num_draws);
+  ASSERT_EQ(test_half.pass_mean, 1);
+  ASSERT_EQ(test_half.pass_var, 1);
+  ASSERT_EQ(test_half.pass_covar, 1);
+  ASSERT_EQ(test_half.pass_hist1d_mean, 1);
+  ASSERT_EQ(test_half.pass_hist1d_var, 1);
+  ASSERT_EQ(test_half.pass_hist1d_covar, 1);
+  ASSERT_EQ(test_half.pass_hist3d_mean, 1);
+  ASSERT_EQ(test_half.pass_hist3d_var, 1);
+  ASSERT_EQ(test_half.pass_hist3d_covar, 1);
+  deep_copy(density_1d, 0);
+  deep_copy(density_3d, 0);
+
   cout << "Test Scalar=float" << endl;
   test_random_scalar<RandomGenerator, float> test_float(density_1d, density_3d,
                                                         pool, num_draws);
diff --git a/packages/kokkos/appveyor.yml b/packages/kokkos/appveyor.yml
index e8763c0b665c4a992f74b70eab0caa915beb33dd..73a0d3187596be4d4c99ef9f211b93bd0659079e 100644
--- a/packages/kokkos/appveyor.yml
+++ b/packages/kokkos/appveyor.yml
@@ -3,4 +3,8 @@ image:
 clone_folder: c:\projects\source
 build_script:
 - cmd: >-
-    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake
+    mkdir build &&
+    cd build &&
+    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF &&
+    cmake --build . --target install &&
+    ctest -C Debug --output-on-failure
diff --git a/packages/kokkos/benchmarks/atomic/main.cpp b/packages/kokkos/benchmarks/atomic/main.cpp
index 7b5caa1aee1658104a8916bec314759e3e5ba30a..cc0d3e41e85aaa7483d11dbf639cc5a9d5809a47 100644
--- a/packages/kokkos/benchmarks/atomic/main.cpp
+++ b/packages/kokkos/benchmarks/atomic/main.cpp
@@ -1,12 +1,12 @@
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_Random.hpp>
 
 template <class Scalar>
 double test_atomic(int L, int N, int M, int K, int R,
                    Kokkos::View<const int*> offsets) {
   Kokkos::View<Scalar*> output("Output", N);
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   for (int r = 0; r < R; r++)
     Kokkos::parallel_for(
@@ -28,7 +28,7 @@ template <class Scalar>
 double test_no_atomic(int L, int N, int M, int K, int R,
                       Kokkos::View<const int*> offsets) {
   Kokkos::View<Scalar*> output("Output", N);
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   for (int r = 0; r < R; r++)
     Kokkos::parallel_for(
         L, KOKKOS_LAMBDA(const int& i) {
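Kokkos::Timer (from the public <Kokkos_Timer.hpp> header) is a drop-in replacement for the internal Kokkos::Impl::Timer used before: construction or reset() starts the clock, seconds() reads the elapsed wall time. A minimal sketch:

````cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>

double time_fill(int n) {
  Kokkos::View<double*> x("x", n);
  Kokkos::Timer timer;  // starts timing
  Kokkos::parallel_for(
      "fill", n, KOKKOS_LAMBDA(const int i) { x(i) = 2.0 * i; });
  Kokkos::fence();  // make sure the kernel has finished before reading
  return timer.seconds();
}
````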
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
index 62d7ef4a4cf387191c6d0276c4ea360c289d4de5..4fc6ca2c68b3a77e37360b90b678ae5c461204f6 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 template <class Scalar, int Unroll, int Stride>
 struct Run {
diff --git a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
index 6da2407a08b7afb981ffb8c3a970b2df7d55f951..75f30a340938378fe85f72b4dd294235594cf21d 100644
--- a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
+++ b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <bench.hpp>
 #include <cstdlib>
 
diff --git a/packages/kokkos/benchmarks/gather/main.cpp b/packages/kokkos/benchmarks/gather/main.cpp
index 5f10e4dcc1aa509c191d3c7a6486114b3c0b7de9..dd502faaa480c1c7ab9936e4f032095094e714bb 100644
--- a/packages/kokkos/benchmarks/gather/main.cpp
+++ b/packages/kokkos/benchmarks/gather/main.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <gather.hpp>
 #include <cstdlib>
 
diff --git a/packages/kokkos/benchmarks/stream/stream-kokkos.cpp b/packages/kokkos/benchmarks/stream/stream-kokkos.cpp
index e7ef67e0805c9c4424fbf1781423cf907dca3eec..311947c197cba64e8354dc63743baa99b0bfa782 100644
--- a/packages/kokkos/benchmarks/stream/stream-kokkos.cpp
+++ b/packages/kokkos/benchmarks/stream/stream-kokkos.cpp
@@ -52,35 +52,33 @@
 
 #define HLINE "-------------------------------------------------------------\n"
 
-#if defined(KOKKOS_ENABLE_CUDA)
-using StreamHostArray   = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror;
-using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>;
-#else
-using StreamHostArray   = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror;
-using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>;
-#endif
+using StreamDeviceArray =
+    Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Restrict>>;
+using StreamHostArray = typename StreamDeviceArray::HostMirror;
 
 using StreamIndex = int;
+using Policy      = Kokkos::RangePolicy<Kokkos::IndexType<StreamIndex>>;
 
-double now() {
-  struct timeval now;
-  gettimeofday(&now, nullptr);
+void perform_set(StreamDeviceArray& a, const double scalar) {
+  Kokkos::parallel_for(
+      "set", Policy(0, a.extent(0)),
+      KOKKOS_LAMBDA(const StreamIndex i) { a[i] = scalar; });
 
-  return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
+  Kokkos::fence();
 }
 
-void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b,
-                  StreamDeviceArray& c) {
+void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b) {
   Kokkos::parallel_for(
-      "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; });
+      "copy", Policy(0, a.extent(0)),
+      KOKKOS_LAMBDA(const StreamIndex i) { b[i] = a[i]; });
 
   Kokkos::fence();
 }
 
-void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
-                   StreamDeviceArray& c, const double scalar) {
+void perform_scale(StreamDeviceArray& b, StreamDeviceArray& c,
+                   const double scalar) {
   Kokkos::parallel_for(
-      "copy", a.extent(0),
+      "scale", Policy(0, b.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; });
 
   Kokkos::fence();
@@ -89,7 +87,7 @@ void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
 void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
                  StreamDeviceArray& c) {
   Kokkos::parallel_for(
-      "add", a.extent(0),
+      "add", Policy(0, a.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; });
 
   Kokkos::fence();
@@ -98,7 +96,7 @@ void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
 void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b,
                    StreamDeviceArray& c, const double scalar) {
   Kokkos::parallel_for(
-      "triad", a.extent(0),
+      "triad", Policy(0, a.extent(0)),
       KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; });
 
   Kokkos::fence();
@@ -184,6 +182,7 @@ int run_benchmark() {
 
   const double scalar = 3.0;
 
+  double setTime   = std::numeric_limits<double>::max();
   double copyTime  = std::numeric_limits<double>::max();
   double scaleTime = std::numeric_limits<double>::max();
   double addTime   = std::numeric_limits<double>::max();
@@ -191,13 +190,10 @@ int run_benchmark() {
 
   printf("Initializing Views...\n");
 
-#if defined(KOKKOS_HAVE_OPENMP)
-  Kokkos::parallel_for(
-      "init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
-#else
   Kokkos::parallel_for(
-      "init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
-#endif
+      "init",
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,
+                                                             STREAM_ARRAY_SIZE),
       KOKKOS_LAMBDA(const int i) {
         a[i] = 1.0;
         b[i] = 2.0;
@@ -209,26 +205,30 @@ int run_benchmark() {
   Kokkos::deep_copy(dev_b, b);
   Kokkos::deep_copy(dev_c, c);
 
-  double start;
-
   printf("Starting benchmarking...\n");
 
+  Kokkos::Timer timer;
+
   for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) {
-    start = now();
-    perform_copy(dev_a, dev_b, dev_c);
-    copyTime = std::min(copyTime, (now() - start));
+    timer.reset();
+    perform_set(dev_c, 1.5);
+    setTime = std::min(setTime, timer.seconds());
+
+    timer.reset();
+    perform_copy(dev_a, dev_c);
+    copyTime = std::min(copyTime, timer.seconds());
 
-    start = now();
-    perform_scale(dev_a, dev_b, dev_c, scalar);
-    scaleTime = std::min(scaleTime, (now() - start));
+    timer.reset();
+    perform_scale(dev_b, dev_c, scalar);
+    scaleTime = std::min(scaleTime, timer.seconds());
 
-    start = now();
+    timer.reset();
     perform_add(dev_a, dev_b, dev_c);
-    addTime = std::min(addTime, (now() - start));
+    addTime = std::min(addTime, timer.seconds());
 
-    start = now();
+    timer.reset();
     perform_triad(dev_a, dev_b, dev_c, scalar);
-    triadTime = std::min(triadTime, (now() - start));
+    triadTime = std::min(triadTime, timer.seconds());
   }
 
   Kokkos::deep_copy(a, dev_a);
@@ -240,6 +240,9 @@ int run_benchmark() {
 
   printf(HLINE);
 
+  printf("Set             %11.2f MB/s\n",
+         (1.0e-06 * 1.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
+             setTime);
   printf("Copy            %11.2f MB/s\n",
          (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
              copyTime);
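The rewritten STREAM benchmark above replaces the hand-rolled `gettimeofday` timer with `Kokkos::Timer`, gives the Views a `Restrict` memory trait, and runs every kernel through a `RangePolicy` with a 32-bit index type. A minimal sketch of that View/policy idiom, with a hypothetical `scale` kernel mirroring the patch:

```cpp
// Sketch of the View/RangePolicy idiom used in the reworked STREAM benchmark.
#include <Kokkos_Core.hpp>

using DeviceArray = Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Restrict>>;
using HostArray   = typename DeviceArray::HostMirror;
using Index       = int;
using Policy      = Kokkos::RangePolicy<Kokkos::IndexType<Index>>;

void scale(DeviceArray& b, DeviceArray& c, const double scalar) {
  // Restrict promises the Views do not alias, which can help vectorization;
  // the 32-bit IndexType avoids 64-bit index arithmetic in the inner loop.
  Kokkos::parallel_for(
      "scale", Policy(0, b.extent(0)),
      KOKKOS_LAMBDA(const Index i) { b[i] = scalar * c[i]; });
  Kokkos::fence();  // include kernel completion in any surrounding timing
}
```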
diff --git a/packages/kokkos/bin/hpcbind b/packages/kokkos/bin/hpcbind
index 6af091a7d8b60766cddae67c6076b5df1f8ad12f..43f8a745da27c080ce54ac4cfd9b9358f618554f 100755
--- a/packages/kokkos/bin/hpcbind
+++ b/packages/kokkos/bin/hpcbind
@@ -634,15 +634,15 @@ elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
   > ${HPCBIND_OUT}
   if [[ ${HPCBIND_TEE} -eq 0 ]]; then
     if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     else
-      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
+      eval "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     fi
   else
     if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
     else
-      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+      eval "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
     fi
   fi
 fi
diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper
index 4e52e4d09f4f86668ccd322b8ab2fe1093f31996..ba2c55508aca0c139853ce7107c53f8a405ec9c3 100755
--- a/packages/kokkos/bin/nvcc_wrapper
+++ b/packages/kokkos/bin/nvcc_wrapper
@@ -96,7 +96,7 @@ replace_pragma_ident=0
 first_xcompiler_arg=1
 
 # Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop)
-if [[ ! -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
+if [[ -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
   temp_dir=${TMPDIR:-/tmp}
 else
   temp_dir=${NVCC_WRAPPER_TMPDIR}
@@ -226,14 +226,14 @@ do
     cuda_args="$cuda_args $1"
     ;;
   #Handle more known nvcc args
-  --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets)
+  --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler)
     cuda_args="$cuda_args $1"
     ;;
   #Handle known nvcc args that have an argument
-  -maxrregcount=*|--maxrregcount=*)
+  -maxrregcount=*|--maxrregcount=*|-time=*)
     cuda_args="$cuda_args $1"
     ;;
-  -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include)
+  -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include|-time)
     cuda_args="$cuda_args $1 $2"
     shift
     ;;
@@ -552,14 +552,14 @@ if [ $host_only -eq 1 ]; then
   $host_command
 elif [ -n "$nvcc_depfile_command" ]; then
   if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
-    echo "$nvcc_command && $nvcc_depfile_command"
+    echo "TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command"
   fi
-  $nvcc_command && $nvcc_depfile_command
+  TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command
 else
   if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
-    echo "$nvcc_command"
+    echo "TMPDIR=${temp_dir} $nvcc_command"
   fi
-  $nvcc_command
+  TMPDIR=${temp_dir} $nvcc_command
 fi
 error_code=$?
 
diff --git a/packages/kokkos/cmake/CTestConfig.cmake.in b/packages/kokkos/cmake/CTestConfig.cmake.in
deleted file mode 100644
index 1f82c0d64d15e0a4fb346cfb7227be9cd41e5f17..0000000000000000000000000000000000000000
--- a/packages/kokkos/cmake/CTestConfig.cmake.in
+++ /dev/null
@@ -1,91 +0,0 @@
-#----------------------------------------------------------------------------------------#
-#
-#   CTestConfig.cmake template for Kokkos
-#
-#----------------------------------------------------------------------------------------#
-
-#
-#   dash-board related
-#
-set(CTEST_PROJECT_NAME "Kokkos")
-set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC")
-set(CTEST_DROP_METHOD "https")
-set(CTEST_DROP_SITE "cdash.nersc.gov")
-set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}")
-set(CTEST_CDASH_VERSION "1.6")
-set(CTEST_CDASH_QUERY_VERSION TRUE)
-set(CTEST_SUBMIT_RETRY_COUNT "1")
-set(CTEST_SUBMIT_RETRY_DELAY "30")
-
-#
-#   configure/build related
-#
-set(CTEST_BUILD_NAME "@BUILD_NAME@")
-set(CTEST_MODEL "@MODEL@")
-set(CTEST_SITE "@SITE@")
-set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@")
-set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@")
-set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@")
-
-#
-#   configure/build related
-#
-set(CTEST_UPDATE_TYPE "git")
-set(CTEST_UPDATE_VERSION_ONLY ON)
-# set(CTEST_GENERATOR "")
-# set(CTEST_GENERATOR_PLATFORM "")
-
-#
-#   testing related
-#
-set(CTEST_TIMEOUT "7200")
-set(CTEST_TEST_TIMEOUT "7200")
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
-set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
-set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576")
-
-#
-#   coverage related
-#
-set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*")
-
-#
-#   commands
-#
-if(NOT "@CHECKOUT_COMMAND@" STREQUAL "")
-    set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@")
-endif()
-set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@")
-set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@")
-set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@")
-if(NOT WIN32)
-    set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@")
-endif()
-set(CTEST_COVERAGE_COMMAND "gcov")
-set(CTEST_MEMORYCHECK_COMMAND "valgrind")
-set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@")
-
-#
-#   various configs
-#
-set(APPEND_VALUE @APPEND@)
-if(APPEND_VALUE)
-    set(APPEND_CTEST APPEND)
-endif()
-
-macro(SET_TEST_PROP VAR)
-    if(NOT "${ARGS}" STREQUAL "")
-        set(${VAR}_CTEST ${VAR} ${ARGN})
-    endif()
-endmacro()
-
-set_test_prop(START           @START@)
-set_test_prop(END             @END@)
-set_test_prop(STRIDE          @STRIDE@)
-set_test_prop(INCLUDE         @INCLUDE@)
-set_test_prop(EXCLUDE         @EXCLUDE@)
-set_test_prop(INCLUDE_LABEL   @INCLUDE_LABEL@)
-set_test_prop(EXCLUDE_LABEL   @EXCLUDE_LABEL@)
-set_test_prop(PARALLEL_LEVEL  @PARALLEL_LEVEL@)
-set_test_prop(STOP_TIME       @STOP_TIME@)
-set_test_prop(COVERAGE_LABELS @LABELS@)
diff --git a/packages/kokkos/cmake/KokkosCI.cmake b/packages/kokkos/cmake/KokkosCI.cmake
deleted file mode 100644
index e8c9af37ad544a93a62f498e9a903696321a1c75..0000000000000000000000000000000000000000
--- a/packages/kokkos/cmake/KokkosCI.cmake
+++ /dev/null
@@ -1,350 +0,0 @@
-cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
-
-message(STATUS "")
-
-get_cmake_property(_cached_vars CACHE_VARIABLES)
-set(KOKKOS_CMAKE_ARGS)
-set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT"
-                       "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE")
-list(SORT _cached_vars)
-foreach(_var ${_cached_vars})
-    if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES)
-        list(APPEND KOKKOS_CMAKE_ARGS ${_var})
-        if("${_var}" STREQUAL "CMAKE_BUILD_TYPE")
-            set(BUILD_TYPE "${CMAKE_BUILD_TYPE}")
-        endif()
-    endif()
-endforeach()
-
-
-#----------------------------------------------------------------------------------------#
-#
-#   Macros and variables
-#
-#----------------------------------------------------------------------------------------#
-
-macro(CHECK_REQUIRED VAR)
-    if(NOT DEFINED ${VAR})
-        message(FATAL_ERROR "Error! Variable '${VAR}' must be defined")
-    endif()
-endmacro()
-
-# require the build name variable
-CHECK_REQUIRED(BUILD_NAME)
-
-# uses all args
-macro(SET_DEFAULT VAR)
-    if(NOT DEFINED ${VAR})
-        set(${VAR} ${ARGN})
-    endif()
-    # remove these ctest configuration variables from the defines
-    # passed to the Kokkos configuration
-    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
-        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
-    endif()
-endmacro()
-
-# uses first arg -- useful for selecting via priority from multiple
-# potentially defined variables, e.g.:
-#
-#   set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME})
-#
-macro(SET_DEFAULT_ARG1 VAR)
-    if(NOT DEFINED ${VAR})
-        foreach(_ARG ${ARGN})
-            if(NOT "${_ARG}" STREQUAL "")
-                set(${VAR} ${_ARG})
-                break()
-            endif()
-        endforeach()
-    endif()
-    # remove these ctest configuration variables from the defines
-    # passed to the Kokkos configuration
-    if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
-        list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
-    endif()
-endmacro()
-
-# determine the default working directory
-if(NOT "$ENV{WORKSPACE}" STREQUAL "")
-    set(WORKING_DIR "$ENV{WORKSPACE}")
-else()
-    get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
-endif()
-
-# determine the hostname
-execute_process(COMMAND hostname
-    OUTPUT_VARIABLE HOSTNAME
-    OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}")
-
-# get the number of processors
-include(ProcessorCount)
-ProcessorCount(NUM_PROCESSORS)
-
-# find git
-find_package(Git QUIET)
-if(NOT GIT_EXECUTABLE)
-    unset(GIT_EXECUTABLE CACHE)
-    unset(GIT_EXECUTABLE)
-endif()
-
-function(EXECUTE_GIT_COMMAND VAR)
-    set(${VAR} "" PARENT_SCOPE)
-    execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN}
-        OUTPUT_VARIABLE VAL
-        RESULT_VARIABLE RET
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
-        ERROR_QUIET)
-    string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}")
-    set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE)
-    if(RET EQUAL 0)
-        set(${VAR} "${VAL}" PARENT_SCOPE)
-    endif()
-endfunction()
-
-# just gets the git branch name if available
-function(GET_GIT_BRANCH_NAME VAR)
-    execute_git_command(GIT_BRANCH branch --show-current)
-    set(_INVALID "%D" "HEAD")
-    if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
-        execute_git_command(GIT_BRANCH show -s --format=%D)
-        if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
-            execute_git_command(GIT_BRANCH --describe all)
-        endif()
-    endif()
-    #
-    if(GIT_BRANCH)
-        string(REPLACE " " ";" _DESC "${GIT_BRANCH}")
-        # just set it to last one via loop instead of wonky cmake index manip
-        foreach(_ITR ${_DESC})
-            set(GIT_BRANCH "${_ITR}")
-        endforeach()
-        set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE)
-        message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}")
-    endif()
-endfunction()
-
-# just gets the git branch name if available
-function(GET_GIT_AUTHOR_NAME VAR)
-    execute_git_command(GIT_AUTHOR show -s --format=%an)
-    if(GIT_AUTHOR)
-        string(LENGTH "${GIT_AUTHOR}" STRLEN)
-        # if the build name gets too long, this can cause submission errors
-        if(STRLEN GREATER 24)
-            # remove middle initial
-            string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}")
-            # get first and sur name
-            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}")
-            string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}")
-            if(S_NAME)
-                set(GIT_AUTHOR "${S_NAME}")
-            elseif(F_NAME)
-                set(GIT_AUTHOR "${F_NAME}")
-            endif()
-        endif()
-        # remove any spaces, quotes, periods, etc.
-        string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}")
-        set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE)
-        message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}")
-    endif()
-endfunction()
-
-# get the name of the branch
-GET_GIT_BRANCH_NAME(GIT_BRANCH)
-# get the name of the author
-GET_GIT_AUTHOR_NAME(GIT_AUTHOR)
-# author, prefer git method for consistency
-SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR})
-# SLUG == owner_name/repo_name
-SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG})
-# branch name
-SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH})
-# pull request number
-SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM})
-# get the event type, e.g. push, pull_request, api, cron, etc.
-SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE})
-
-if("${BRANCH}" STREQUAL "")
-    message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'")
-    message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=<name>")
-endif()
-
-#----------------------------------------------------------------------------------------#
-#
-#   Set default values if not provided on command-line
-#
-#----------------------------------------------------------------------------------------#
-
-SET_DEFAULT(SOURCE_DIR      "${WORKING_DIR}")           # source directory
-SET_DEFAULT(BINARY_DIR      "${WORKING_DIR}/build")     # build directory
-SET_DEFAULT(BUILD_TYPE      "${CMAKE_BUILD_TYPE}")      # Release, Debug, etc.
-SET_DEFAULT(MODEL           "Continuous")               # Continuous, Nightly, or Experimental
-SET_DEFAULT(JOBS            1)                          # number of parallel ctests
-SET_DEFAULT(CTEST_COMMAND   "${CMAKE_CTEST_COMMAND}")   # just in case
-SET_DEFAULT(CTEST_ARGS      "-V --output-on-failure")   # extra arguments when ctest is called
-SET_DEFAULT(GIT_EXECUTABLE  "git")                      # ctest_update
-SET_DEFAULT(TARGET          "all")                      # build target
-SET_DEFAULT_ARG1(SITE       "$ENV{SITE}"
-                            "${HOSTNAME}")              # update site
-SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}"
-                            "${NUM_PROCESSORS}")        # number of parallel compile jobs
-#
-#   The variable below correspond to ctest arguments, i.e. START,END,STRIDE are
-#   '-I START,END,STRIDE'
-#
-SET_DEFAULT(START           "")
-SET_DEFAULT(END             "")
-SET_DEFAULT(STRIDE          "")
-SET_DEFAULT(INCLUDE         "")
-SET_DEFAULT(EXCLUDE         "")
-SET_DEFAULT(INCLUDE_LABEL   "")
-SET_DEFAULT(EXCLUDE_LABEL   "")
-SET_DEFAULT(PARALLEL_LEVEL  "")
-SET_DEFAULT(STOP_TIME       "")
-SET_DEFAULT(LABELS          "")
-SET_DEFAULT(NOTES           "")
-
-# default static build tag for Nightly
-set(BUILD_TAG "${BRANCH}")
-
-if(NOT BUILD_TYPE)
-    # default for kokkos if not specified
-    set(BUILD_TYPE "RelWithDebInfo")
-endif()
-
-# generate dynamic name if continuous or experimental model
-if(NOT "${MODEL}" STREQUAL "Nightly")
-    if(EVENT_TYPE AND PULL_REQUEST_NUM)
-        # e.g. pull_request/123
-        if(AUTHOR)
-            set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}")
-        else()
-            set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}")
-        endif()
-    elseif(SLUG)
-        # e.g. owner_name/repo_name
-        set(BUILD_TAG "${SLUG}")
-    elseif(AUTHOR)
-        set(BUILD_TAG "${AUTHOR}/${BRANCH}")
-    endif()
-    if(EVENT_TYPE AND NOT PULL_REQUEST_NUM)
-        set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}")
-    endif()
-endif()
-
-# unnecessary
-string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}")
-string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}")
-
-message(STATUS "BUILD_TAG: ${BUILD_TAG}")
-
-set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]")
-
-# colons in build name create extra (empty) entries in CDash
-string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}")
-# unnecessary info
-string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}")
-# consistency
-string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}")
-string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}")
-# miscellaneous from missing fields
-string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}")
-string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}")
-
-# check binary directory
-if(EXISTS ${BINARY_DIR})
-    if(NOT IS_DIRECTORY "${BINARY_DIR}")
-        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!")
-    endif()
-    file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*")
-    if(NOT "${BINARY_DIR_FILES}" STREQUAL "")
-        message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not empty!")
-    endif()
-endif()
-
-get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH)
-get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH)
-
-#----------------------------------------------------------------------------------------#
-#
-#   Generate the CTestConfig.cmake
-#
-#----------------------------------------------------------------------------------------#
-
-set(CONFIG_ARGS)
-foreach(_ARG ${KOKKOS_CMAKE_ARGS})
-    if(NOT "${${_ARG}}" STREQUAL "")
-        get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE)
-        if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED")
-            if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF")
-                set(_ARG_TYPE "BOOL")
-            elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}")
-                set(_ARG_TYPE "FILEPATH")
-            elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}")
-                set(_ARG_TYPE "PATH")
-            elseif(NOT "${${_ARG}}" STREQUAL "")
-                set(_ARG_TYPE "STRING")
-            endif()
-        endif()
-        set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n")
-    endif()
-endforeach()
-
-file(WRITE ${BINARY_REALDIR}/initial-cache.cmake
-"
-set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\")
-${CONFIG_ARGS}
-")
-
-file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO)
-message(STATUS "Initial cache:\n${_CACHE_INFO}")
-
-# initialize the cache
-set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake")
-
-
-# generate the CTestConfig.cmake
-configure_file(
-    ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in
-    ${BINARY_REALDIR}/CTestConfig.cmake
-    @ONLY)
-
-# copy/generate the dashboard script
-configure_file(
-    ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in
-    ${BINARY_REALDIR}/KokkosCTest.cmake
-    @ONLY)
-
-# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake
-execute_process(
-    COMMAND             ${CMAKE_COMMAND} -E touch CTestCustom.cmake
-    WORKING_DIRECTORY   ${BINARY_REALDIR}
-    )
-
-#----------------------------------------------------------------------------------------#
-#
-#   Execute CTest
-#
-#----------------------------------------------------------------------------------------#
-
-message(STATUS "")
-message(STATUS "BUILD_NAME: ${BUILD_NAME}")
-message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...")
-message(STATUS "")
-
-# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV"
-string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}")
-
-execute_process(
-    COMMAND             ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}
-    RESULT_VARIABLE     RET
-    WORKING_DIRECTORY   ${BINARY_REALDIR}
-    )
-
-# ensure that any non-zero result variable gets propagated
-if(NOT RET EQUAL 0)
-    message(FATAL_ERROR "CTest return non-zero exit code: ${RET}")
-endif()
diff --git a/packages/kokkos/cmake/KokkosCTest.cmake.in b/packages/kokkos/cmake/KokkosCTest.cmake.in
deleted file mode 100644
index b6917f3cc1897aa6b1f0876560bb08c0c87b4c3a..0000000000000000000000000000000000000000
--- a/packages/kokkos/cmake/KokkosCTest.cmake.in
+++ /dev/null
@@ -1,261 +0,0 @@
-cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
-
-if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
-    include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
-endif()
-
-include(ProcessorCount)
-ProcessorCount(CTEST_PROCESSOR_COUNT)
-
-cmake_policy(SET CMP0009 NEW)
-cmake_policy(SET CMP0011 NEW)
-
-# ---------------------------------------------------------------------------- #
-# -- Commands
-# ---------------------------------------------------------------------------- #
-find_program(CTEST_CMAKE_COMMAND    NAMES cmake)
-find_program(CTEST_UNAME_COMMAND    NAMES uname)
-
-find_program(CTEST_BZR_COMMAND      NAMES bzr)
-find_program(CTEST_CVS_COMMAND      NAMES cvs)
-find_program(CTEST_GIT_COMMAND      NAMES git)
-find_program(CTEST_HG_COMMAND       NAMES hg)
-find_program(CTEST_P4_COMMAND       NAMES p4)
-find_program(CTEST_SVN_COMMAND      NAMES svn)
-
-find_program(VALGRIND_COMMAND       NAMES valgrind)
-find_program(GCOV_COMMAND           NAMES gcov)
-find_program(LCOV_COMMAND           NAMES llvm-cov)
-find_program(MEMORYCHECK_COMMAND    NAMES valgrind )
-
-set(MEMORYCHECK_TYPE Valgrind)
-# set(MEMORYCHECK_TYPE Purify)
-# set(MEMORYCHECK_TYPE BoundsChecker)
-# set(MEMORYCHECK_TYPE ThreadSanitizer)
-# set(MEMORYCHECK_TYPE AddressSanitizer)
-# set(MEMORYCHECK_TYPE LeakSanitizer)
-# set(MEMORYCHECK_TYPE MemorySanitizer)
-# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer)
-set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full")
-
-# ---------------------------------------------------------------------------- #
-# -- Settings
-# ---------------------------------------------------------------------------- #
-## -- Process timeout in seconds
-set(CTEST_TIMEOUT           "7200")
-## -- Set output to English
-set(ENV{LC_MESSAGES}        "en_EN" )
-
-
-# ---------------------------------------------------------------------------- #
-# -- Copy ctest configuration file
-# ---------------------------------------------------------------------------- #
-macro(COPY_CTEST_CONFIG_FILES)
-
-    foreach(_FILE CTestConfig.cmake CTestCustom.cmake)
-
-        # if current directory is not binary or source directory
-        if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND
-           NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
-
-            # if file exists in current directory
-            if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE})
-                configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE}
-                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
-            endif()
-
-        # if source and binary differ
-        elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")
-
-            # if file exists in source directory but not in binary directory
-            if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND
-               NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE})
-                configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE}
-                    ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
-            endif()
-
-        endif()
-    endforeach()
-
-endmacro()
-
-ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}")
-
-message(STATUS "CTEST_MODEL: ${CTEST_MODEL}")
-
-#-------------------------------------------------------------------------#
-# Start
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...")
-message(STATUS "")
-
-ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST}
-    ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})
-
-
-#-------------------------------------------------------------------------#
-# Config
-#
-copy_ctest_config_files()
-ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}")
-
-
-#-------------------------------------------------------------------------#
-# Update
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...")
-message(STATUS "")
-
-ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}"
-    RETURN_VALUE up_ret)
-
-
-#-------------------------------------------------------------------------#
-# Configure
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...")
-message(STATUS "")
-
-ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}"
-    SOURCE ${CTEST_SOURCE_DIRECTORY}
-    ${APPEND_CTEST}
-    OPTIONS "${CTEST_CONFIGURE_OPTIONS}"
-    RETURN_VALUE config_ret)
-
-
-#-------------------------------------------------------------------------#
-# Echo configure log bc Damien wants to delay merging this PR for eternity
-#
-file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log")
-# should only have one but loop just for safety
-foreach(_LOG ${_configure_log})
-    file(READ ${_LOG} _LOG_MESSAGE)
-    message(STATUS "Configure Log: ${_LOG}")
-    message(STATUS "\n${_LOG_MESSAGE}\n")
-endforeach()
-
-
-#-------------------------------------------------------------------------#
-# Build
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...")
-message(STATUS "")
-
-ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}"
-    ${APPEND_CTEST}
-    RETURN_VALUE build_ret)
-
-
-#-------------------------------------------------------------------------#
-# Echo build log bc Damien wants to delay merging this PR for eternity
-#
-file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log")
-# should only have one but loop just for safety
-foreach(_LOG ${_build_log})
-    file(READ ${_LOG} _LOG_MESSAGE)
-    message(STATUS "Build Log: ${_LOG}")
-    message(STATUS "\n${_LOG_MESSAGE}\n")
-endforeach()
-
-
-#-------------------------------------------------------------------------#
-# Test
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...")
-message(STATUS "")
-
-ctest_test(RETURN_VALUE test_ret
-    ${APPEND_CTEST}
-    ${START_CTEST}
-    ${END_CTEST}
-    ${STRIDE_CTEST}
-    ${INCLUDE_CTEST}
-    ${EXCLUDE_CTEST}
-    ${INCLUDE_LABEL_CTEST}
-    ${EXCLUDE_LABEL_CTEST}
-    ${PARALLEL_LEVEL_CTEST}
-    ${STOP_TIME_CTEST}
-    SCHEDULE_RANDOM OFF)
-
-
-#-------------------------------------------------------------------------#
-# Coverage
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...")
-message(STATUS "")
-
-execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS}
-    WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY}
-    ERROR_QUIET)
-
-ctest_coverage(${APPEND_CTEST}
-    ${CTEST_COVERAGE_LABELS}
-    RETURN_VALUE cov_ret)
-
-
-#-------------------------------------------------------------------------#
-# MemCheck
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...")
-message(STATUS "")
-
-ctest_memcheck(RETURN_VALUE mem_ret
-    ${APPEND_CTEST}
-    ${START_CTEST}
-    ${END_CTEST}
-    ${STRIDE_CTEST}
-    ${INCLUDE_CTEST}
-    ${EXCLUDE_CTEST}
-    ${INCLUDE_LABEL_CTEST}
-    ${EXCLUDE_LABEL_CTEST}
-    ${PARALLEL_LEVEL_CTEST})
-
-
-#-------------------------------------------------------------------------#
-# Submit
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...")
-message(STATUS "")
-
-file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake")
-foreach(_FILE ${NOTE_FILES})
-    message(STATUS "Including CTest notes files: \"${_FILE}\"...")
-    include("${_FILE}")
-endforeach()
-
-# capture submit error so it doesn't fail because of a submission error
-ctest_submit(RETURN_VALUE submit_ret
-    RETRY_COUNT 2
-    RETRY_DELAY 10
-    CAPTURE_CMAKE_ERROR submit_err)
-
-#-------------------------------------------------------------------------#
-# Submit
-#
-message(STATUS "")
-message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})")
-message(STATUS "")
-
-
-#-------------------------------------------------------------------------#
-# Non-zero exit codes for important errors
-#
-if(NOT config_ret EQUAL 0)
-    message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}")
-endif()
-
-if(NOT build_ret EQUAL 0)
-    message(FATAL_ERROR "Error during build! Exit code: ${build_ret}")
-endif()
-
-if(NOT test_ret EQUAL 0)
-    message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}")
-endif()
diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in
index 3455b0cb42e78c7e17286c70edd9f19274b8dcfb..07baa0a5f09a708d344a9ef72510baa6f4b8e15b 100644
--- a/packages/kokkos/cmake/KokkosCore_config.h.in
+++ b/packages/kokkos/cmake/KokkosCore_config.h.in
@@ -41,6 +41,7 @@
 #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
 #cmakedefine KOKKOS_ENABLE_DEBUG
@@ -49,17 +50,21 @@
 #cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS
 #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
 #cmakedefine KOKKOS_ENABLE_TUNING
-#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE
+#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3
+#cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS
 #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
 #cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
-#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION  // deprecated
+#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
 
 /* TPL Settings */
 #cmakedefine KOKKOS_ENABLE_HWLOC
 #cmakedefine KOKKOS_USE_LIBRT
 #cmakedefine KOKKOS_ENABLE_HBWSPACE
 #cmakedefine KOKKOS_ENABLE_LIBDL
+#cmakedefine KOKKOS_ENABLE_LIBQUADMATH
 #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
 
 #cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@
@@ -79,6 +84,12 @@
 #cmakedefine KOKKOS_ARCH_POWER8
 #cmakedefine KOKKOS_ARCH_POWER9
 #cmakedefine KOKKOS_ARCH_INTEL_GEN
+#cmakedefine KOKKOS_ARCH_INTEL_DG1
+#cmakedefine KOKKOS_ARCH_INTEL_GEN9
+#cmakedefine KOKKOS_ARCH_INTEL_GEN11
+#cmakedefine KOKKOS_ARCH_INTEL_GEN12LP
+#cmakedefine KOKKOS_ARCH_INTEL_XEHP
+#cmakedefine KOKKOS_ARCH_INTEL_GPU
 #cmakedefine KOKKOS_ARCH_KEPLER
 #cmakedefine KOKKOS_ARCH_KEPLER30
 #cmakedefine KOKKOS_ARCH_KEPLER32
@@ -95,6 +106,7 @@
 #cmakedefine KOKKOS_ARCH_VOLTA70
 #cmakedefine KOKKOS_ARCH_VOLTA72
 #cmakedefine KOKKOS_ARCH_TURING75
+#cmakedefine KOKKOS_ARCH_AMPERE
 #cmakedefine KOKKOS_ARCH_AMPERE80
 #cmakedefine KOKKOS_ARCH_AMPERE86
 #cmakedefine KOKKOS_ARCH_AMD_ZEN
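The configuration header above gains new `#cmakedefine` switches such as `KOKKOS_ENABLE_DEPRECATED_CODE_3` and `KOKKOS_ENABLE_DEPRECATION_WARNINGS`. A hedged sketch of how downstream code can guard on the generated macros; `legacy_helper` is a made-up placeholder, not a Kokkos API:

```cpp
#include <Kokkos_Core.hpp>  // pulls in the generated KokkosCore_config.h

#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3)
// Compiled only while APIs deprecated in major release 3 are still available.
void legacy_helper() { /* old implementation kept for compatibility */ }
#else
void legacy_helper() { /* replacement path once the deprecated code is gone */ }
#endif
```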
diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
index 8d58d96415808499dc39d44ad3600f5f5a64368e..0c825c59e04248f2cd76d5faf9c6aa16a663bbb1 100644
--- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
+++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake
@@ -29,7 +29,12 @@ ELSE()
 ENDIF()
 
 include(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER)
+IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI)
+  SET(KOKKOS_CUDA_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1")
+ELSE()
+  SET(KOKKOS_CUDA_ERROR DEFAULT_MSG)
+ENDIF()
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER)
 IF (FOUND_CUDA_DRIVER AND FOUND_CUDART)
   KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
     LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart
diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..be70b711e0f92ac9d99e3c3fdd2770430f6c2b68
--- /dev/null
+++ b/packages/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake
@@ -0,0 +1 @@
+KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath)
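The new `FindTPLLIBQUADMATH.cmake` above wires GCC's libquadmath into the build as an optional TPL. A small standalone sketch of the quadmath API it exposes (assumes GCC and linking with `-lquadmath`; not Kokkos-specific):

```cpp
#include <quadmath.h>  // __float128 math from libquadmath
#include <cstdio>

int main() {
  __float128 x = sqrtq(2.0Q);  // 128-bit square root
  char buf[128];
  // __float128 values need quadmath_snprintf; printf cannot format them.
  quadmath_snprintf(buf, sizeof(buf), "%.30Qg", x);
  std::printf("sqrt(2) ~ %s\n", buf);
  return 0;
}
```

Build the sketch with something like `g++ quad_example.cpp -lquadmath` (the file name is illustrative).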
diff --git a/packages/kokkos/cmake/deps/quadmath.cmake b/packages/kokkos/cmake/deps/quadmath.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..826f5021d359e99c2aed8b695de7b601dabaf453
--- /dev/null
+++ b/packages/kokkos/cmake/deps/quadmath.cmake
@@ -0,0 +1,46 @@
+# @HEADER
+# ************************************************************************
+#
+#                        Kokkos v. 3.0
+#       Copyright (2020) National Technology & Engineering
+#               Solutions of Sandia, LLC (NTESS).
+#
+# Under the terms of Contract DE-NA0003525 with NTESS,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+#
+# ************************************************************************
+# @HEADER
+
+KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
+  REQUIRED_HEADERS quadmath.h
+  REQUIRED_LIBS_NAMES quadmath
+)
diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake
index e8b85542c633eaea6f63c32ada79c7d7b2402794..c4637339f31fa47e80d9b53a1435b3ddc7641573 100644
--- a/packages/kokkos/cmake/kokkos_arch.cmake
+++ b/packages/kokkos/cmake/kokkos_arch.cmake
@@ -67,8 +67,13 @@ KOKKOS_ARCH_OPTION(ZEN3            HOST "AMD Zen3 architecture")
 KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900")
 KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906")
 KOKKOS_ARCH_OPTION(VEGA908         GPU  "AMD GPU MI100 GFX908")
+KOKKOS_ARCH_OPTION(VEGA90A         GPU  "AMD GPU MI200 GFX90A")
 KOKKOS_ARCH_OPTION(INTEL_GEN       GPU  "Intel GPUs Gen9+")
-
+KOKKOS_ARCH_OPTION(INTEL_DG1       GPU  "Intel Iris XeMAX GPU")
+KOKKOS_ARCH_OPTION(INTEL_GEN9      GPU  "Intel GPU Gen9")
+KOKKOS_ARCH_OPTION(INTEL_GEN11     GPU  "Intel GPU Gen11")
+KOKKOS_ARCH_OPTION(INTEL_GEN12LP   GPU  "Intel GPU Gen12LP")
+KOKKOS_ARCH_OPTION(INTEL_XEHP      GPU  "Intel GPU Xe-HP")
 
 
 IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
@@ -76,6 +81,12 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
     "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic"
     "-Wsign-compare" "-Wtype-limits" "-Wuninitialized")
 
+  # NOTE: the KOKKOS_-prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH
+  IF(Kokkos_ENABLE_LIBQUADMATH)
+    # warning: non-standard suffix on floating constant [-Wpedantic]
+    LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic")
+  ENDIF()
+
   # OpenMPTarget compilers give erroneous warnings about sign comparison in loops
   IF(KOKKOS_ENABLE_OPENMPTARGET)
     LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare")
@@ -86,7 +97,7 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
 
   COMPILER_SPECIFIC_FLAGS(
     COMPILER_ID CMAKE_CXX_COMPILER_ID
-    PGI         NO-VALUE-SPECIFIED
+    NVHPC       NO-VALUE-SPECIFIED
     GNU         ${GNU_WARNINGS}
     DEFAULT     ${COMMON_WARNINGS}
   )
@@ -158,16 +169,18 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ARMV80)
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8-a
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_ARMV81)
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8.1-a
   )
 ENDIF()
@@ -175,8 +188,9 @@ ENDIF()
 IF (KOKKOS_ARCH_ARMV8_THUNDERX)
   SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8-a -mtune=thunderx
   )
 ENDIF()
@@ -184,23 +198,28 @@ ENDIF()
 IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
   SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable
   COMPILER_SPECIFIC_FLAGS(
-    Cray NO-VALUE-SPECIFIED
-    PGI  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    Cray    NO-VALUE-SPECIFIED
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_A64FX)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -march=armv8.2-a+sve
-    Clang -march=armv8.2-a+sve -msve-vector-bits=512
-    GCC -march=armv8.2-a+sve -msve-vector-bits=512
+    Clang   -march=armv8.2-a+sve -msve-vector-bits=512
+    GCC     -march=armv8.2-a+sve -msve-vector-bits=512
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_ZEN)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen
     DEFAULT -march=znver1 -mtune=znver1
   )
   SET(KOKKOS_ARCH_AMD_ZEN  ON)
@@ -209,7 +228,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ZEN2)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen2
     DEFAULT -march=znver2 -mtune=znver2
   )
   SET(KOKKOS_ARCH_AMD_ZEN2 ON)
@@ -218,7 +239,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_ZEN3)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx2
+    NVHPC   -tp=zen2
     DEFAULT -march=znver3 -mtune=znver3
   )
   SET(KOKKOS_ARCH_AMD_ZEN3 ON)
@@ -227,8 +250,9 @@ ENDIF()
 
 IF (KOKKOS_ARCH_WSM)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xSSE4.2
-    PGI     -tp=nehalem
+    NVHPC   -tp=px
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -msse4.2
   )
@@ -238,8 +262,9 @@ ENDIF()
 IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX)
   SET(KOKKOS_ARCH_AVX ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -mavx
-    PGI     -tp=sandybridge
+    NVHPC   -tp=sandybridge
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -mavx
   )
@@ -248,8 +273,9 @@ ENDIF()
 IF (KOKKOS_ARCH_HSW)
   SET(KOKKOS_ARCH_AVX2 ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX2
-    PGI     -tp=haswell
+    NVHPC   -tp=haswell
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=core-avx2 -mtune=core-avx2
   )
@@ -258,8 +284,9 @@ ENDIF()
 IF (KOKKOS_ARCH_BDW)
   SET(KOKKOS_ARCH_AVX2 ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX2
-    PGI     -tp=haswell
+    NVHPC   -tp=haswell
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm
   )
@@ -269,8 +296,9 @@ IF (KOKKOS_ARCH_KNL)
   #avx512-mic
   SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xMIC-AVX512
-    PGI     NO-VALUE-SPECIFIED
+    NVHPC   -tp=knl
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=knl -mtune=knl
   )
@@ -279,6 +307,7 @@ ENDIF()
 IF (KOKKOS_ARCH_KNC)
   SET(KOKKOS_USE_ISA_KNC ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     DEFAULT -mmic
   )
 ENDIF()
@@ -287,8 +316,9 @@ IF (KOKKOS_ARCH_SKX)
   #avx512-xeon
   SET(KOKKOS_ARCH_AVX512XEON ON)
   COMPILER_SPECIFIC_FLAGS(
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
     Intel   -xCORE-AVX512
-    PGI     NO-VALUE-SPECIFIED
+    NVHPC   -tp=skylake
     Cray    NO-VALUE-SPECIFIED
     DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
   )
@@ -304,7 +334,8 @@ ENDIF()
 
 IF (KOKKOS_ARCH_POWER7)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   NO-VALUE-SPECIFIED
     DEFAULT -mcpu=power7 -mtune=power7
   )
   SET(KOKKOS_USE_ISA_POWERPCBE ON)
@@ -312,16 +343,16 @@ ENDIF()
 
 IF (KOKKOS_ARCH_POWER8)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
-    NVIDIA  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   -tp=pwr8
     DEFAULT -mcpu=power8 -mtune=power8
   )
 ENDIF()
 
 IF (KOKKOS_ARCH_POWER9)
   COMPILER_SPECIFIC_FLAGS(
-    PGI     NO-VALUE-SPECIFIED
-    NVIDIA  NO-VALUE-SPECIFIED
+    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+    NVHPC   -tp=pwr9
     DEFAULT -mcpu=power9 -mtune=power9
   )
 ENDIF()
@@ -368,7 +399,7 @@ ENDIF()
 
 IF (KOKKOS_ENABLE_SYCL)
   COMPILER_SPECIFIC_FLAGS(
-    DEFAULT -fsycl
+    DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int
   )
   COMPILER_SPECIFIC_OPTIONS(
     DEFAULT -fsycl-unnamed-lambda
@@ -443,20 +474,58 @@ ENDFUNCTION()
 CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25
 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60
 CHECK_AMDGPU_ARCH(VEGA908 gfx908)
+CHECK_AMDGPU_ARCH(VEGA90A gfx90a)
 
 IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED)
-  MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
-                     "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
+  IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
+    FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator)
+    EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS)
+    STRING(LENGTH "${GPU_ARCHS}" len_str)
+    # the enumerator always outputs gfx000 as the first line
+    IF(${len_str} LESS 8)
+      MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
+                         "Please enable one AMD GPU architecture via '-DKokkos_ARCH_{..}=ON'.")
+    ENDIF()
+  ELSE()
+    MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
+                       "Please enable one AMD GPU architecture via '-DKokkos_ARCH_{..}=ON'.")
+  ENDIF()
+ENDIF()
+
+MACRO(CHECK_MULTIPLE_INTEL_ARCH)
+  IF(KOKKOS_ARCH_INTEL_GPU)
+    MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!")
+  ENDIF()
+  SET(KOKKOS_ARCH_INTEL_GPU ON)
+ENDMACRO()
+
+IF(KOKKOS_ARCH_INTEL_GEN)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_DG1)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN9)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN11)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_GEN12LP)
+  CHECK_MULTIPLE_INTEL_ARCH()
+ENDIF()
+IF(KOKKOS_ARCH_INTEL_XEHP)
+  CHECK_MULTIPLE_INTEL_ARCH()
 ENDIF()
 
 IF (KOKKOS_ENABLE_OPENMPTARGET)
   SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
   IF (CLANG_CUDA_ARCH)
-    STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH})
+    STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH})
     COMPILER_SPECIFIC_FLAGS(
       Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda
-      XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
-      PGI -gpu=${PGI_CUDA_ARCH}
+      XL    -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
+      NVHPC -gpu=${NVHPC_CUDA_ARCH}
     )
   ENDIF()
   SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG})
@@ -465,7 +534,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
       Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa
     )
   ENDIF()
-  IF (KOKKOS_ARCH_INTEL_GEN)
+  IF (KOKKOS_ARCH_INTEL_GPU)
     COMPILER_SPECIFIC_FLAGS(
       IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
     )
@@ -485,7 +554,27 @@ IF (KOKKOS_ENABLE_SYCL)
     ENDIF()
   ELSEIF(KOKKOS_ARCH_INTEL_GEN)
     COMPILER_SPECIFIC_FLAGS(
-      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl"
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
+    )
+  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
+    COMPILER_SPECIFIC_FLAGS(
+      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
     )
   ENDIF()
 ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake
index 23847263a952ce0e94fe48c58dbdfc50b228b314..5afed4fb0e7ba0cd2bca8250b6f58e4434f483ec 100644
--- a/packages/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake
@@ -137,7 +137,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Clang      4.0.0 or higher"
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    GCC        5.3.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Intel     17.0.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    NVCC      9.2.88 or higher")
-SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC      3.8.0 or higher")
+SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC      4.2.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    PGI         17.4 or higher\n")
 
 IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
@@ -158,13 +158,23 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
   ENDIF()
   SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE)
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
-  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0)
+  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.2.0)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
   IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4)
     MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
   ENDIF()
+  # Treat PGI internally as NVHPC to simplify handling both compilers.
+  # Before CMake 3.20, NVHPC was identified as PGI; nvc++ is
+  # backward-compatible with pgc++.
+  SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
+ENDIF()
+
+IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID)
+  SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID})
+ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI)
+  SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
 ENDIF()
 
 STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION})
diff --git a/packages/kokkos/cmake/kokkos_enable_devices.cmake b/packages/kokkos/cmake/kokkos_enable_devices.cmake
index d7f83ddbdf877b672cfc196f89d6b3f61d109087..7fd0794036454da9d8a246fd4a3a19fe2e5cf0ef 100644
--- a/packages/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_devices.cmake
@@ -62,7 +62,7 @@ IF(KOKKOS_ENABLE_OPENMP)
       COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
       Clang      -Xcompiler ${ClangOpenMPFlag}
       IntelLLVM  -Xcompiler -fiopenmp
-      PGI        -Xcompiler -mp
+      NVHPC      -Xcompiler -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -Xcompiler -qsmp=omp
       DEFAULT    -Xcompiler -fopenmp
@@ -72,7 +72,7 @@ IF(KOKKOS_ENABLE_OPENMP)
       Clang      ${ClangOpenMPFlag}
       IntelLLVM  -fiopenmp
       AppleClang -Xpreprocessor -fopenmp
-      PGI        -mp
+      NVHPC      -mp
       Cray       NO-VALUE-SPECIFIED
       XL         -qsmp=omp
       DEFAULT    -fopenmp
@@ -94,7 +94,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
     Clang      ${ClangOpenMPFlag} -Wno-openmp-mapping
     IntelLLVM  -fiopenmp -Wno-openmp-mapping
     XL         -qsmp=omp -qoffload -qnoeh
-    PGI        -mp=gpu
+    NVHPC      -mp=gpu
     DEFAULT    -fopenmp
   )
   COMPILER_SPECIFIC_DEFS(
diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake
index 95bce66c7bee32f8800cbd6e0324f9d4c599c97c..4cb8bd20f5ecb3e519ef64d9e1c31c0a5cb7e431 100644
--- a/packages/kokkos/cmake/kokkos_enable_options.cmake
+++ b/packages/kokkos/cmake/kokkos_enable_options.cmake
@@ -26,9 +26,16 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID)
 # Put a check in just in case people are using this option
 KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE)
 
+# Set the Default for Desul Atomics usage.
+set(_DESUL_ATOMICS_DEFAULT ON)
+
 KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for CUDA")
 KOKKOS_ENABLE_OPTION(CUDA_UVM             OFF "Whether to use unified memory (UM) for CUDA by default")
 KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC   OFF "Whether to use CUDA LDG intrinsics")
+# As of 08/12/2021, CudaMallocAsync causes issues if UCX is used as the MPI communication layer.
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC      OFF  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
+KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3    ON "Whether code deprecated in major release 3 is available" )
+KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
 KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for HIP")
 KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH   OFF "Whether HPX supports asynchronous dispatch")
 KOKKOS_ENABLE_OPTION(TESTS         OFF  "Whether to build the unit tests")
@@ -50,6 +57,9 @@ KOKKOS_ENABLE_OPTION(TUNING               OFF "Whether to create bindings for tu
 KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
 KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER      ON  "Whether to potentially use the launch compiler")
 
+# This option will go away eventually, but allows falling back to the old implementation when needed.
+KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS   ON  "Whether to use desul based atomics - option only during beta")
+
 IF (KOKKOS_ENABLE_CUDA)
   SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")
 ENDIF()
diff --git a/packages/kokkos/cmake/kokkos_functions.cmake b/packages/kokkos/cmake/kokkos_functions.cmake
index e1a3e5f8bd00802f465390f332138bbadd4f1a33..02c9a911b1b827994b7d4a1e0c004cfb55afd749 100644
--- a/packages/kokkos/cmake/kokkos_functions.cmake
+++ b/packages/kokkos/cmake/kokkos_functions.cmake
@@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET)
 ENDFUNCTION()
 
 FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
-  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
+  SET(COMPILERS NVIDIA NVHPC XL XLClang DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
   CMAKE_PARSE_ARGUMENTS(
     PARSE
     "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"
diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
index 707fb000af528694780d6668f160a3fee3472a69..1eb0592c7f054185e566f053faa931029f92fbc1 100644
--- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
+++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake
@@ -140,7 +140,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE)
   IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray)
     INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake)
     kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
-  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
+  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
     INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake)
     kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
   ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel)
diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake
index d8d044c9d75384a1d8d312a94708623c735d121f..51bad521c4878c00b6b8c7587d7233c26a1d4ba9 100644
--- a/packages/kokkos/cmake/kokkos_tpls.cmake
+++ b/packages/kokkos/cmake/kokkos_tpls.cmake
@@ -67,6 +67,12 @@ SET(PTHREAD_DEFAULT OFF)
 ENDIF()
 KOKKOS_TPL_OPTION(PTHREAD ${PTHREAD_DEFAULT} TRIBITS Pthread)
 
+IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath)
+  SET(LIBQUADMATH_DEFAULT ON)
+ELSE()
+  SET(LIBQUADMATH_DEFAULT OFF)
+ENDIF()
+KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath)
 
 #Make sure we use our local FindKokkosCuda.cmake
 KOKKOS_IMPORT_TPL(HPX INTERFACE)
@@ -78,6 +84,7 @@ KOKKOS_IMPORT_TPL(LIBDL)
 KOKKOS_IMPORT_TPL(MEMKIND)
 KOKKOS_IMPORT_TPL(PTHREAD INTERFACE)
 KOKKOS_IMPORT_TPL(ROCM INTERFACE)
+KOKKOS_IMPORT_TPL(LIBQUADMATH)
 
 #Convert list to newlines (which CMake doesn't always like in cache variables)
 STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}")
diff --git a/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1f7587da808fd587f6380079f9a672f124b3a25b
--- /dev/null
+++ b/packages/kokkos/cmake/tpls/FindTPLquadmath.cmake
@@ -0,0 +1,46 @@
+# @HEADER
+# ************************************************************************
+#
+#                        Kokkos v. 3.0
+#       Copyright (2020) National Technology & Engineering
+#               Solutions of Sandia, LLC (NTESS).
+#
+# Under the terms of Contract DE-NA0003525 with NTESS,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+#
+# ************************************************************************
+# @HEADER
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
+  REQUIRED_HEADERS quadmath.h
+  REQUIRED_LIBS_NAMES quadmath
+)
diff --git a/packages/kokkos/containers/performance_tests/TestDynRankView.hpp b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
index 8c507c76621d09b134ad94f12da589e8c31a014c..7ed9a0271a51db453ff29a982e57cd17d70a642d 100644
--- a/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
+++ b/packages/kokkos/containers/performance_tests/TestDynRankView.hpp
@@ -48,7 +48,7 @@
 #include <Kokkos_DynRankView.hpp>
 #include <vector>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 // Compare performance of DynRankView to View, specific focus on the parenthesis
 // operators
diff --git a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
index 65de551b2715f1eb31f4385fa0cb2a455bca6a4f..16b74a4997e5f1e643e095e167253829d47a050a 100644
--- a/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
+++ b/packages/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -48,7 +48,7 @@
 #include <vector>
 #include <algorithm>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 // This test will simulate global ids
 
diff --git a/packages/kokkos/containers/performance_tests/TestScatterView.hpp b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
index 0f3ba103efc5d09d012e3cc35cbfa41fa8be9170..8a23f59d32cdd4f6290465ad41fa70d521e39bfb 100644
--- a/packages/kokkos/containers/performance_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/performance_tests/TestScatterView.hpp
@@ -46,7 +46,7 @@
 #define KOKKOS_TEST_SCATTER_VIEW_HPP
 
 #include <Kokkos_ScatterView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace Perf {
 
diff --git a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
index c31412552ad696ada0dad4fd1058f76290282256..4547d5c35758e2eadc0e5029779f0d2e23fc4081 100644
--- a/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
+++ b/packages/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -43,7 +43,7 @@
 #ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
 #define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 #include <iostream>
 #include <iomanip>
diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
index ea1d6dde5d26e9baf719281a0d8f13bb80ec59f8..c5b66f05a3ce0b7778fdcbc8e7a3e766301273d0 100644
--- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -76,20 +76,25 @@ class Bitset {
   using execution_space = Device;
   using size_type       = unsigned int;
 
-  enum { BIT_SCAN_REVERSE = 1u };
-  enum { MOVE_HINT_BACKWARD = 2u };
-
-  enum {
-    BIT_SCAN_FORWARD_MOVE_HINT_FORWARD  = 0u,
-    BIT_SCAN_REVERSE_MOVE_HINT_FORWARD  = BIT_SCAN_REVERSE,
-    BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD,
-    BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
-  };
+  static constexpr unsigned BIT_SCAN_REVERSE   = 1u;
+  static constexpr unsigned MOVE_HINT_BACKWARD = 2u;
+
+  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u;
+  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_FORWARD =
+      BIT_SCAN_REVERSE;
+  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD =
+      MOVE_HINT_BACKWARD;
+  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD =
+      BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD;
 
  private:
-  enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
-  enum { block_mask = block_size - 1u };
-  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
+  enum : unsigned {
+    block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT)
+  };
+  enum : unsigned { block_mask = block_size - 1u };
+  enum : unsigned {
+    block_shift = Kokkos::Impl::integral_power_of_two(block_size)
+  };
 
  public:
   /// constructor
@@ -317,14 +322,18 @@ class ConstBitset {
   enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
 
  public:
+  KOKKOS_FUNCTION
   ConstBitset() : m_size(0) {}
 
+  KOKKOS_FUNCTION
   ConstBitset(Bitset<Device> const& rhs)
       : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
 
+  KOKKOS_FUNCTION
   ConstBitset(ConstBitset<Device> const& rhs)
       : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
 
+  KOKKOS_FUNCTION
   ConstBitset<Device>& operator=(Bitset<Device> const& rhs) {
     this->m_size   = rhs.m_size;
     this->m_blocks = rhs.m_blocks;
@@ -332,6 +341,7 @@ class ConstBitset {
     return *this;
   }
 
+  KOKKOS_FUNCTION
   ConstBitset<Device>& operator=(ConstBitset<Device> const& rhs) {
     this->m_size   = rhs.m_size;
     this->m_blocks = rhs.m_blocks;
diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp
index 45710d1f737ca14348dd79d698bbc4a581225bbb..f55d0f2b7f3f10b43ea4ee076dc4dea191010449 100644
--- a/packages/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp
@@ -597,8 +597,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
     if (std::is_same<typename t_host::memory_space,
                      typename t_dev::memory_space>::value) {
-      typename t_dev::execution_space().fence();
-      typename t_host::execution_space().fence();
+      typename t_dev::execution_space().fence(
+          "Kokkos::DualView<>::sync: fence after syncing DualView");
+      typename t_host::execution_space().fence(
+          "Kokkos::DualView<>::sync: fence after syncing DualView");
     }
   }
 
@@ -776,10 +778,11 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
   /// If \c Device is the same as this DualView's device type, then
   /// mark the device's data as modified.  Otherwise, mark the host's
   /// data as modified.
-  template <class Device>
+  template <class Device, class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   void modify() {
     if (modified_flags.data() == nullptr) return;
-    if (impl_dualview_is_single_device::value) return;
     int dev = get_device_side<Device>();
 
     if (dev == 1) {  // if Device is the same as DualView's device type
@@ -811,8 +814,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 #endif
   }
 
+  template <
+      class Device, class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  void modify() {
+    return;
+  }
+
+  template <class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   inline void modify_host() {
-    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(0) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@@ -832,8 +844,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
   }
 
+  template <
+      class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  inline void modify_host() {
+    return;
+  }
+
+  template <class Dummy = DualView,
+            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
+                nullptr>
   inline void modify_device() {
-    if (impl_dualview_is_single_device::value) return;
     if (modified_flags.data() != nullptr) {
       modified_flags(1) =
           (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@@ -853,6 +874,13 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
     }
   }
 
+  template <
+      class Dummy = DualView,
+      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
+  inline void modify_device() {
+    return;
+  }
+
   inline void clear_sync_state() {
     if (modified_flags.data() != nullptr)
       modified_flags(1) = modified_flags(0) = 0;
@@ -875,8 +903,15 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
-    ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-    h_view = create_mirror_view(d_view);
+    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+    const bool sizeMismatch =
+        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
+
+    if (sizeMismatch) {
+      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+      h_view = create_mirror_view(d_view);
+    } else
+      ::Kokkos::deep_copy(d_view, typename t_dev::value_type{});
 
     /* Reset dirty flags */
     if (modified_flags.data() == nullptr) {
@@ -897,41 +932,31 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
               const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
               const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
+    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+    const bool sizeMismatch =
+        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);
+
     if (modified_flags.data() == nullptr) {
       modified_flags = t_modified_flags("DualView::modified_flags");
     }
     if (modified_flags(1) >= modified_flags(0)) {
       /* Resize on Device */
-      ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-      h_view = create_mirror_view(d_view);
-
-      /* Mark Device copy as modified */
-      modified_flags(1) = modified_flags(1) + 1;
+      if (sizeMismatch) {
+        ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        h_view = create_mirror_view(d_view);
 
+        /* Mark Device copy as modified */
+        modified_flags(1) = modified_flags(1) + 1;
+      }
     } else {
       /* Realloc on Device */
-
-      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
-
-      const bool sizeMismatch =
-          (h_view.extent(0) != n0) || (h_view.extent(1) != n1) ||
-          (h_view.extent(2) != n2) || (h_view.extent(3) != n3) ||
-          (h_view.extent(4) != n4) || (h_view.extent(5) != n5) ||
-          (h_view.extent(6) != n6) || (h_view.extent(7) != n7);
-      if (sizeMismatch)
+      if (sizeMismatch) {
         ::Kokkos::resize(h_view, n0, n1, n2, n3, n4, n5, n6, n7);
+        d_view = create_mirror_view(typename t_dev::execution_space(), h_view);
 
-      t_host temp_view = create_mirror_view(d_view);
-
-      /* Remap on Host */
-      Kokkos::deep_copy(temp_view, h_view);
-
-      h_view = temp_view;
-
-      d_view = create_mirror_view(typename t_dev::execution_space(), h_view);
-
-      /* Mark Host copy as modified */
-      modified_flags(0) = modified_flags(0) + 1;
+        /* Mark Host copy as modified */
+        modified_flags(0) = modified_flags(0) + 1;
+      }
     }
   }
 
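The single-device overloads above rely on a common C++ idiom: a defaulted template parameter (Dummy) re-introduces the condition as a dependent expression so that std::enable_if_t can pick one of two member-function overloads at compile time instead of branching at run time. Below is a minimal, self-contained sketch of that idiom only; DualViewLike, SingleDevice, and impl_is_single_device are illustrative names, not Kokkos API.

// Minimal sketch of the enable_if-on-a-defaulted-template-parameter pattern
// used by the modify()/modify_host()/modify_device() overloads above.
// Requires C++14 for std::enable_if_t.
#include <iostream>
#include <type_traits>

template <bool SingleDevice>
struct DualViewLike {
  using impl_is_single_device = std::integral_constant<bool, SingleDevice>;

  // Enabled only when host and device data live in distinct memory spaces.
  template <class Dummy = DualViewLike,
            std::enable_if_t<!Dummy::impl_is_single_device::value>* = nullptr>
  void modify_host() {
    std::cout << "marking host copy as modified\n";
  }

  // Enabled only when both views alias the same memory: nothing to track.
  template <class Dummy = DualViewLike,
            std::enable_if_t<Dummy::impl_is_single_device::value>* = nullptr>
  void modify_host() {}
};

int main() {
  DualViewLike<false> two_spaces;
  DualViewLike<true> one_space;
  two_spaces.modify_host();  // prints
  one_space.modify_host();   // no-op, selected at compile time
  return 0;
}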
diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
index c6323fef93694de1ee39d5784141bf6991f78bd7..b673c53a4ef8e8a760c613332418ae5d600a6812 100644
--- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -1140,7 +1140,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from using Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1154,7 +1155,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1404,7 +1406,7 @@ class ViewMapping<
 
   template <class MemoryTraits>
   struct apply {
-    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
+    static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, "");
 
     using traits_type =
         Kokkos::ViewTraits<data_type, array_layout,
@@ -1574,7 +1576,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(const DynRankView<LT, LP...>& lhs,
 namespace Kokkos {
 namespace Impl {
 
-template <class OutputView, typename Enable = void>
+template <class OutputView, class Enable = void>
 struct DynRankViewFill {
   using const_value_type = typename OutputView::traits::const_value_type;
 
@@ -1693,9 +1695,11 @@ inline void deep_copy(
                    typename ViewTraits<DT, DP...>::value_type>::value,
       "deep_copy requires non-const type");
 
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(DynRankView, value_type): fence before filling view");
   Kokkos::Impl::DynRankViewFill<DynRankView<DT, DP...> >(dst, value);
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(DynRankView, value_type): fence after filling view");
 }
 
 /** \brief  Deep copy into a value in Host memory from a view.  */
@@ -1711,10 +1715,13 @@ inline void deep_copy(
 
   using src_traits       = ViewTraits<ST, SP...>;
   using src_memory_space = typename src_traits::memory_space;
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(value_type, DynRankView): fence before copying "
+      "value");
   Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(),
                                                       sizeof(ST));
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::deep_copy(value_type, DynRankView): fence after copying value");
 }
 
 //----------------------------------------------------------------------------
@@ -1744,14 +1751,14 @@ inline void deep_copy(
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   if ((void*)dst.data() != (void*)src.data()) {
@@ -1762,10 +1769,14 @@ inline void deep_copy(
     // memory then can byte-wise copy
     if (rank(src) == 0 && rank(dst) == 0) {
       using value_type = typename dst_type::value_type;
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-0 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), sizeof(value_type));
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-0 views");
     } else if (std::is_same<
                    typename DstType::traits::value_type,
                    typename SrcType::traits::non_const_value_type>::value &&
@@ -1787,10 +1798,14 @@ inline void deep_copy(
                dst.extent(6) == src.extent(6) &&
                dst.extent(7) == src.extent(7)) {
       const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-1 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-1 views");
     } else if (std::is_same<
                    typename DstType::traits::value_type,
                    typename SrcType::traits::non_const_value_type>::value &&
@@ -1817,29 +1832,43 @@ inline void deep_copy(
                dst.stride_6() == src.stride_6() &&
                dst.stride_7() == src.stride_7()) {
       const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "copying rank-1 views");
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "copying rank-1 views");
     } else if (DstExecCanAccessSrc) {
       // Copying data between views in accessible memory spaces and either
       // non-contiguous or incompatible shape.
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "remapping views of incompatible shape");
       Kokkos::Impl::DynRankViewRemap<dst_type, src_type>(dst, src);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "remapping views of incompatible shape");
     } else if (SrcExecCanAccessDst) {
       // Copying data between views in accessible memory spaces and either
       // non-contiguous or incompatible shape.
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
+          "remapping views of incompatible shape");
       Kokkos::Impl::DynRankViewRemap<dst_type, src_type, src_execution_space>(
           dst, src);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
+          "remapping views of incompatible shape");
     } else {
       Kokkos::Impl::throw_runtime_exception(
           "deep_copy given views that would require a temporary allocation");
     }
   } else {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence due to same "
+        "src and dst");
   }
 }
 
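The hunks above, and many that follow, replace bare fence() calls with labelled ones. A short sketch of the labelled-fence overloads, assuming Kokkos 3.4 or newer: both the global Kokkos::fence and the per-instance execution-space fence accept a string intended to identify the synchronization point in profiling and debugging output. View names, labels, and sizes below are illustrative.

// Labelled fences: the string names the reason for the synchronization.
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> a("a", 1000);
    Kokkos::parallel_for(
        "init_a", a.extent(0), KOKKOS_LAMBDA(const int i) { a(i) = i; });
    // Global fence over all execution space instances.
    Kokkos::fence("example: fence after initializing a");
    // Fence on a single execution space instance.
    Kokkos::DefaultExecutionSpace().fence(
        "example: fence on the default execution space");
  }
  Kokkos::finalize();
  return 0;
}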
diff --git a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
index cc949d4c556ab4abd982ea5334fee870c42ef305..2c764f535c585a4f545300d619b83917f327f414 100644
--- a/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -53,36 +53,201 @@
 namespace Kokkos {
 namespace Experimental {
 
-// Simple metafunction for choosing memory space
-// In the current implementation, if memory_space == CudaSpace,
-// use CudaUVMSpace for the chunk 'array' allocation, which
-// contains will contain pointers to chunks of memory allocated
-// in CudaSpace
 namespace Impl {
-template <class MemSpace>
-struct ChunkArraySpace {
-  using memory_space = MemSpace;
-};
 
-#ifdef KOKKOS_ENABLE_CUDA
-template <>
-struct ChunkArraySpace<Kokkos::CudaSpace> {
-  using memory_space = typename Kokkos::CudaUVMSpace;
-};
-#endif
-#ifdef KOKKOS_ENABLE_HIP
-template <>
-struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> {
-  using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace;
-};
-#endif
-#ifdef KOKKOS_ENABLE_SYCL
-template <>
-struct ChunkArraySpace<Kokkos::Experimental::SYCLDeviceUSMSpace> {
-  using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace;
+/// Utility class to manage memory for chunked arrays on the host and
+/// device. Allocates/deallocates memory on both the host and device along with
+/// providing utilities for creating mirrors and deep copying between them.
+template <typename MemorySpace, typename ValueType>
+struct ChunkedArrayManager {
+  using value_type   = ValueType;
+  using pointer_type = ValueType*;
+  using track_type   = Kokkos::Impl::SharedAllocationTracker;
+
+  ChunkedArrayManager()                           = default;
+  ChunkedArrayManager(ChunkedArrayManager const&) = default;
+  ChunkedArrayManager(ChunkedArrayManager&&)      = default;
+  ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default;
+  ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default;
+
+  template <typename Space, typename Value>
+  friend struct ChunkedArrayManager;
+
+  template <typename Space, typename Value>
+  inline ChunkedArrayManager(const ChunkedArrayManager<Space, Value>& rhs)
+      : m_valid(rhs.m_valid),
+        m_chunk_max(rhs.m_chunk_max),
+        m_chunks((ValueType**)(rhs.m_chunks)),
+        m_track(rhs.m_track),
+        m_chunk_size(rhs.m_chunk_size) {
+    static_assert(
+        Kokkos::Impl::MemorySpaceAccess<MemorySpace, Space>::assignable,
+        "Incompatible ChunkedArrayManager copy construction");
+  }
+
+  ChunkedArrayManager(const unsigned arg_chunk_max,
+                      const unsigned arg_chunk_size)
+      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}
+
+ private:
+  struct ACCESSIBLE_TAG {};
+  struct INACCESSIBLE_TAG {};
+
+  ChunkedArrayManager(ACCESSIBLE_TAG, pointer_type* arg_chunks,
+                      const unsigned arg_chunk_max)
+      : m_valid(true), m_chunk_max(arg_chunk_max), m_chunks(arg_chunks) {}
+
+  ChunkedArrayManager(INACCESSIBLE_TAG, const unsigned arg_chunk_max,
+                      const unsigned arg_chunk_size)
+      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}
+
+ public:
+  template <typename Space, typename Enable_ = void>
+  struct IsAccessibleFrom;
+
+  template <typename Space>
+  struct IsAccessibleFrom<
+      Space, typename std::enable_if_t<Kokkos::Impl::MemorySpaceAccess<
+                 MemorySpace, Space>::accessible>> : std::true_type {};
+
+  template <typename Space>
+  struct IsAccessibleFrom<
+      Space, typename std::enable_if_t<!Kokkos::Impl::MemorySpaceAccess<
+                 MemorySpace, Space>::accessible>> : std::false_type {};
+
+  template <typename Space>
+  static ChunkedArrayManager<Space, ValueType> create_mirror(
+      ChunkedArrayManager<MemorySpace, ValueType> const& other,
+      typename std::enable_if<IsAccessibleFrom<Space>::value>::type* =
+          nullptr) {
+    return ChunkedArrayManager<Space, ValueType>{
+        ACCESSIBLE_TAG{}, other.m_chunks, other.m_chunk_max};
+  }
+
+  template <typename Space>
+  static ChunkedArrayManager<Space, ValueType> create_mirror(
+      ChunkedArrayManager<MemorySpace, ValueType> const& other,
+      typename std::enable_if<!IsAccessibleFrom<Space>::value>::type* =
+          nullptr) {
+    using tag_type =
+        typename ChunkedArrayManager<Space, ValueType>::INACCESSIBLE_TAG;
+    return ChunkedArrayManager<Space, ValueType>{tag_type{}, other.m_chunk_max,
+                                                 other.m_chunk_size};
+  }
+
+ public:
+  void allocate_device(const std::string& label) {
+    if (m_chunks == nullptr) {
+      m_chunks = reinterpret_cast<pointer_type*>(MemorySpace().allocate(
+          label.c_str(), (sizeof(pointer_type) * (m_chunk_max + 2))));
+    }
+  }
+
+  void initialize() {
+    for (unsigned i = 0; i < m_chunk_max + 2; i++) {
+      m_chunks[i] = nullptr;
+    }
+    m_valid = true;
+  }
+
+ private:
+  /// Custom destroy functor for deallocating array chunks along with a linked
+  /// allocation
+  template <typename Space>
+  struct Destroy {
+    Destroy()               = default;
+    Destroy(Destroy&&)      = default;
+    Destroy(const Destroy&) = default;
+    Destroy& operator=(Destroy&&) = default;
+    Destroy& operator=(const Destroy&) = default;
+
+    Destroy(std::string label, value_type** arg_chunk,
+            const unsigned arg_chunk_max, const unsigned arg_chunk_size,
+            value_type** arg_linked)
+        : m_label(label),
+          m_chunks(arg_chunk),
+          m_linked(arg_linked),
+          m_chunk_max(arg_chunk_max),
+          m_chunk_size(arg_chunk_size) {}
+
+    void execute() {
+      // Destroy the array of chunk pointers.
+      // Two entries beyond the max chunks are allocation counters.
+      for (unsigned i = 0; i < m_chunk_max; i++) {
+        Space().deallocate(m_label.c_str(), m_chunks[i],
+                           sizeof(value_type) * m_chunk_size);
+      }
+      // Destroy the linked allocation if we have one.
+      if (m_linked != nullptr) {
+        Space().deallocate(m_label.c_str(), m_linked,
+                           (sizeof(value_type*) * (m_chunk_max + 2)));
+      }
+    }
+
+    void destroy_shared_allocation() { execute(); }
+
+    std::string m_label;
+    value_type** m_chunks = nullptr;
+    value_type** m_linked = nullptr;
+    unsigned m_chunk_max;
+    unsigned m_chunk_size;
+  };
+
+ public:
+  template <typename Space>
+  void allocate_with_destroy(const std::string& label,
+                             pointer_type* linked_allocation = nullptr) {
+    using destroy_type = Destroy<Space>;
+    using record_type =
+        Kokkos::Impl::SharedAllocationRecord<MemorySpace, destroy_type>;
+
+    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
+    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent. This must match
+    // in Destroy's execute(...) method.
+    record_type* const record = record_type::allocate(
+        MemorySpace(), label, (sizeof(pointer_type) * (m_chunk_max + 2)));
+    m_chunks = static_cast<pointer_type*>(record->data());
+    m_track.assign_allocated_record_to_uninitialized(record);
+
+    record->m_destroy = destroy_type(label, m_chunks, m_chunk_max, m_chunk_size,
+                                     linked_allocation);
+  }
+
+  pointer_type* get_ptr() const { return m_chunks; }
+
+  template <typename Space>
+  typename std::enable_if<!IsAccessibleFrom<Space>::value>::type deep_copy_to(
+      ChunkedArrayManager<Space, ValueType> const& other) {
+    Kokkos::Impl::DeepCopy<Space, MemorySpace>(
+        other.m_chunks, m_chunks, sizeof(pointer_type) * (m_chunk_max + 2));
+  }
+
+  template <typename Space>
+  typename std::enable_if<IsAccessibleFrom<Space>::value>::type deep_copy_to(
+      ChunkedArrayManager<Space, ValueType> const&) {
+    // no-op
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type* operator+(int i) const { return m_chunks + i; }
+
+  KOKKOS_INLINE_FUNCTION
+  pointer_type& operator[](int i) const { return m_chunks[i]; }
+
+  track_type const& track() const { return m_track; }
+
+  KOKKOS_INLINE_FUNCTION
+  bool valid() const { return m_valid; }
+
+ private:
+  bool m_valid           = false;
+  unsigned m_chunk_max   = 0;
+  pointer_type* m_chunks = nullptr;
+  track_type m_track;
+  unsigned m_chunk_size = 0;
 };
-#endif
-}  // end namespace Impl
+
+} /* end namespace Impl */
 
 /** \brief Dynamic views are restricted to rank-one and no layout.
  *         Resize only occurs on host outside of parallel_regions.
@@ -93,6 +258,13 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  public:
   using traits = Kokkos::ViewTraits<DataType, P...>;
 
+  using value_type   = typename traits::value_type;
+  using device_space = typename traits::memory_space;
+  using host_space =
+      typename Kokkos::Impl::HostMirror<device_space>::Space::memory_space;
+  using device_accessor = Impl::ChunkedArrayManager<device_space, value_type>;
+  using host_accessor   = Impl::ChunkedArrayManager<host_space, value_type>;
+
  private:
   template <class, class...>
   friend class DynamicView;
@@ -108,7 +280,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                 "DynamicView only implemented for non-specialized View type");
 
   template <class Space, bool = Kokkos::Impl::MemorySpaceAccess<
-                             Space, typename traits::memory_space>::accessible>
+                             Space, device_space>::accessible>
   struct verify_space {
     KOKKOS_FORCEINLINE_FUNCTION static void check() {}
   };
@@ -123,9 +295,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   };
 
  private:
-  track_type m_track;
-  typename traits::value_type** m_chunks =
-      nullptr;             // array of pointers to 'chunks' of memory
+  device_accessor m_chunks;
+  host_accessor m_chunks_host;
   unsigned m_chunk_shift;  // ceil(log2(m_chunk_size))
   unsigned m_chunk_mask;   // m_chunk_size - 1
   unsigned m_chunk_max;  // number of entries in the chunk array - each pointing
@@ -173,7 +344,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 
   KOKKOS_INLINE_FUNCTION
   size_t allocation_extent() const noexcept {
-    uintptr_t n = *reinterpret_cast<const uintptr_t*>(m_chunks + m_chunk_max);
+    uintptr_t n =
+        *reinterpret_cast<const uintptr_t*>(m_chunks_host + m_chunk_max);
     return (n << m_chunk_shift);
   }
 
@@ -183,7 +355,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   KOKKOS_INLINE_FUNCTION
   size_t size() const noexcept {
     size_t extent_0 =
-        *reinterpret_cast<const size_t*>(m_chunks + m_chunk_max + 1);
+        *reinterpret_cast<const size_t*>(m_chunks_host + m_chunk_max + 1);
     return extent_0;
   }
 
@@ -215,10 +387,10 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   // Allocation tracking properties
 
   KOKKOS_INLINE_FUNCTION
-  int use_count() const { return m_track.use_count(); }
+  int use_count() const { return m_chunks_host.track().use_count(); }
 
   inline const std::string label() const {
-    return m_track.template get_label<typename traits::memory_space>();
+    return m_chunks_host.track().template get_label<host_space>();
   }
 
   //----------------------------------------------------------------------
@@ -285,13 +457,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    *          up to the maximum number of chunks
    * */
   template <typename IntType>
-  inline typename std::enable_if<
-      std::is_integral<IntType>::value &&
-      Kokkos::Impl::MemorySpaceAccess<
-          Kokkos::HostSpace,
-          typename Impl::ChunkArraySpace<
-              typename traits::memory_space>::memory_space>::accessible>::type
-  resize_serial(IntType const& n) {
+  inline void resize_serial(IntType const& n) {
     using local_value_type   = typename traits::value_type;
     using value_pointer_type = local_value_type*;
 
@@ -304,37 +470,40 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
     }
 
     // *m_chunks[m_chunk_max] stores the current number of chunks being used
-    uintptr_t* const pc = reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
-    std::string _label =
-        m_track.template get_label<typename traits::memory_space>();
+    uintptr_t* const pc =
+        reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
+    std::string _label = m_chunks_host.track().template get_label<host_space>();
+
     if (*pc < NC) {
       while (*pc < NC) {
-        m_chunks[*pc] = reinterpret_cast<value_pointer_type>(
-            typename traits::memory_space().allocate(
+        m_chunks_host[*pc] =
+            reinterpret_cast<value_pointer_type>(device_space().allocate(
                 _label.c_str(), sizeof(local_value_type) << m_chunk_shift));
         ++*pc;
       }
     } else {
       while (NC + 1 <= *pc) {
         --*pc;
-        typename traits::memory_space().deallocate(
-            _label.c_str(), m_chunks[*pc],
-            sizeof(local_value_type) << m_chunk_shift);
-        m_chunks[*pc] = nullptr;
+        device_space().deallocate(_label.c_str(), m_chunks_host[*pc],
+                                  sizeof(local_value_type) << m_chunk_shift);
+        m_chunks_host[*pc] = nullptr;
       }
     }
-    // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize
+    // *m_chunks_host[m_chunk_max+1] stores the 'extent' requested by resize
     *(pc + 1) = n;
+
+    m_chunks_host.deep_copy_to(m_chunks);
   }
 
   KOKKOS_INLINE_FUNCTION bool is_allocated() const {
-    if (m_chunks == nullptr) {
-      return false;
-    } else {
-      // *m_chunks[m_chunk_max] stores the current number of chunks being used
+    if (m_chunks_host.valid()) {
+      // *m_chunks_host[m_chunk_max] stores the current number of chunks being
+      // used
       uintptr_t* const pc =
-          reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
+          reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
       return (*(pc + 1) > 0);
+    } else {
+      return false;
     }
   }
 
@@ -349,8 +518,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 
   template <class RT, class... RP>
   DynamicView(const DynamicView<RT, RP...>& rhs)
-      : m_track(rhs.m_track),
-        m_chunks((typename traits::value_type**)rhs.m_chunks),
+      : m_chunks(rhs.m_chunks),
+        m_chunks_host(rhs.m_chunks_host),
         m_chunk_shift(rhs.m_chunk_shift),
         m_chunk_mask(rhs.m_chunk_mask),
         m_chunk_max(rhs.m_chunk_max),
@@ -361,63 +530,6 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                   "Incompatible DynamicView copy construction");
   }
 
-  //----------------------------------------------------------------------
-
-  struct Destroy {
-    using local_value_type = typename traits::value_type;
-    std::string m_label;
-    local_value_type** m_chunks;
-    unsigned m_chunk_max;
-    bool m_destroy;
-    unsigned m_chunk_size;
-
-    // Initialize or destroy array of chunk pointers.
-    // Two entries beyond the max chunks are allocation counters.
-    inline void operator()(unsigned i) const {
-      if (m_destroy && i < m_chunk_max && nullptr != m_chunks[i]) {
-        typename traits::memory_space().deallocate(
-            m_label.c_str(), m_chunks[i],
-            sizeof(local_value_type) * m_chunk_size);
-      }
-      m_chunks[i] = nullptr;
-    }
-
-    void execute(bool arg_destroy) {
-      using Range = Kokkos::RangePolicy<typename HostSpace::execution_space>;
-
-      m_destroy = arg_destroy;
-
-      Kokkos::Impl::ParallelFor<Destroy, Range> closure(
-          *this,
-          Range(0, m_chunk_max + 2));  // Add 2 to 'destroy' extra slots storing
-                                       // num_chunks and extent; previously + 1
-
-      closure.execute();
-
-      typename traits::execution_space().fence();
-      // Impl::ChunkArraySpace< typename traits::memory_space
-      // >::memory_space::execution_space().fence();
-    }
-
-    void construct_shared_allocation() { execute(false); }
-
-    void destroy_shared_allocation() { execute(true); }
-
-    Destroy()               = default;
-    Destroy(Destroy&&)      = default;
-    Destroy(const Destroy&) = default;
-    Destroy& operator=(Destroy&&) = default;
-    Destroy& operator=(const Destroy&) = default;
-
-    Destroy(std::string label, typename traits::value_type** arg_chunk,
-            const unsigned arg_chunk_max, const unsigned arg_chunk_size)
-        : m_label(label),
-          m_chunks(arg_chunk),
-          m_chunk_max(arg_chunk_max),
-          m_destroy(false),
-          m_chunk_size(arg_chunk_size) {}
-  };
-
   /**\brief  Allocation constructor
    *
    *  Memory is allocated in chunks
@@ -427,10 +539,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   explicit inline DynamicView(const std::string& arg_label,
                               const unsigned min_chunk_size,
                               const unsigned max_extent)
-      : m_track(),
-        m_chunks(nullptr)
-        // The chunk size is guaranteed to be a power of two
-        ,
+      :  // The chunk size is guaranteed to be a power of two
         m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains(
             min_chunk_size))  // div ceil(log2(min_chunk_size))
         ,
@@ -440,28 +549,22 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                     m_chunk_shift)  // max num pointers-to-chunks in array
         ,
         m_chunk_size(2 << (m_chunk_shift - 1)) {
-    using chunk_array_memory_space = typename Impl::ChunkArraySpace<
-        typename traits::memory_space>::memory_space;
-    // A functor to deallocate all of the chunks upon final destruction
-    using record_type =
-        Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space, Destroy>;
-
-    // Allocate chunk pointers and allocation counter
-    record_type* const record =
-        record_type::allocate(chunk_array_memory_space(), arg_label,
-                              (sizeof(pointer_type) * (m_chunk_max + 2)));
-    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
-    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in
-    // Destroy's execute(...) method
-
-    m_chunks = reinterpret_cast<pointer_type*>(record->data());
-
-    record->m_destroy = Destroy(arg_label, m_chunks, m_chunk_max, m_chunk_size);
+    m_chunks = device_accessor(m_chunk_max, m_chunk_size);
 
-    // Initialize to zero
-    record->m_destroy.construct_shared_allocation();
-
-    m_track.assign_allocated_record_to_uninitialized(record);
+    if (device_accessor::template IsAccessibleFrom<host_space>::value) {
+      m_chunks.template allocate_with_destroy<device_space>(arg_label);
+      m_chunks.initialize();
+      m_chunks_host =
+          device_accessor::template create_mirror<host_space>(m_chunks);
+    } else {
+      m_chunks.allocate_device(arg_label);
+      m_chunks_host =
+          device_accessor::template create_mirror<host_space>(m_chunks);
+      m_chunks_host.template allocate_with_destroy<device_space>(
+          arg_label, m_chunks.get_ptr());
+      m_chunks_host.initialize();
+      m_chunks_host.deep_copy_to(m_chunks);
+    }
   }
 };
 
@@ -487,8 +590,8 @@ inline void deep_copy(const View<T, DP...>& dst,
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   if (DstExecCanAccessSrc) {
@@ -512,8 +615,8 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   if (DstExecCanAccessSrc) {
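The ChunkedArrayManager introduced above replaces the old ChunkArraySpace metafunction: the chunk-pointer array now lives in the view's own memory space, with a host-side manager that handles bookkeeping and deep-copies the pointer array to the device after each serial resize when the two spaces differ. From the caller's side the DynamicView interface is unchanged; a small usage sketch, assuming a standard Kokkos build (sizes and labels are illustrative):

// Chunked, resizable rank-one view: resize_serial runs on the host, then the
// view is usable inside kernels up to the resized extent.
#include <Kokkos_Core.hpp>
#include <Kokkos_DynamicView.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Chunks of at least 1024 elements, growable up to 1 << 20 elements.
    Kokkos::Experimental::DynamicView<double*> v("v", 1024, 1 << 20);
    v.resize_serial(10000);  // host-only; allocates the needed chunks
    Kokkos::parallel_for(
        "fill", 10000, KOKKOS_LAMBDA(const int i) { v(i) = 2.0 * i; });
    Kokkos::fence("example: fence after filling DynamicView");
  }
  Kokkos::finalize();
  return 0;
}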
diff --git a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
index fbfaed9b1bcda2d22077947532f3abe303ea5533..18f026dc6ffcffc6c0b1884358ebf5a85012d40e 100644
--- a/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ErrorReporter.hpp
@@ -187,7 +187,8 @@ template <typename ReportType, typename DeviceType>
 void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size) {
   m_reports.resize(new_size);
   m_reporters.resize(new_size);
-  typename DeviceType::execution_space().fence();
+  typename DeviceType::execution_space().fence(
+      "Kokkos::Experimental::ErrorReporter::resize: fence after resizing");
 }
 
 }  // namespace Experimental
diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
index 0f21a08ba3ba86ed176dc4c4535ef76c960e90bc..57bf745d4038de73b71654e518aa855e0faa1698 100644
--- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp
@@ -116,8 +116,7 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds(
       This check should cover the case of Views that don't
       have the Unmanaged trait but were initialized by pointer. */
     if (tracker.has_record()) {
-      Kokkos::Impl::operator_bounds_error_on_device<MapType>(
-          map, Kokkos::Impl::has_printable_label_typedef<MapType>());
+      Kokkos::Impl::operator_bounds_error_on_device(map);
     } else {
       Kokkos::abort("OffsetView bounds error");
     }
@@ -1244,7 +1243,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from using Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::OffsetView::OffsetView(): fence before UVM allocation");
     }
 #endif
     //------------------------------------------------------------
@@ -1256,7 +1256,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::OffsetView::OffsetView(): fence after UVM allocation");
     }
 #endif
     //------------------------------------------------------------
diff --git a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
index dcd4cf73e5d710bc427772a8a8de6384e80c9dae..79bc43b7393d85a1214e0ca3a8dc15861281e44e 100644
--- a/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/packages/kokkos/containers/src/Kokkos_ScatterView.hpp
@@ -834,7 +834,7 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
     static_assert(std::is_same<typename dest_type::array_layout, Layout>::value,
                   "ScatterView contribute destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView contribute destination memory space not accessible");
     if (dest.data() == internal_view.data()) return;
@@ -1061,7 +1061,7 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                                Kokkos::LayoutRight>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     bool is_equal = (dest.data() == internal_view.data());
@@ -1290,7 +1290,7 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
                                Kokkos::LayoutLeft>::value,
                   "ScatterView deep_copy destination has different layout");
     static_assert(
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             execution_space, typename dest_type::memory_space>::accessible,
         "ScatterView deep_copy destination memory space not accessible");
     auto extent   = internal_view.extent(internal_view_type::rank - 1);
diff --git a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
index 81be3ee2d3e836436a23f8808a07f9386bc3ac05..cd633e40310177b116f04220c7030545ba37039d 100644
--- a/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/packages/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -405,7 +405,9 @@ class StaticCrsGraph {
     Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning",
                          Kokkos::RangePolicy<execution_space>(0, numRows()),
                          partitioner);
-    typename device_type::execution_space().fence();
+    typename device_type::execution_space().fence(
+        "Kokkos::StaticCrsGraph::create_block_partitioning:: fence after "
+        "partition");
 
     row_block_offsets = block_offsets;
   }
diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index edb0e7261da93bb629cad4e9cc7c7d3118868288..a1601eee35869f5c26249dbf2ed325c4e84d5420 100644
--- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -345,7 +345,8 @@ class UnorderedMap {
       const impl_value_type tmp = impl_value_type();
       Kokkos::deep_copy(m_values, tmp);
     }
-    { Kokkos::deep_copy(m_scalars, 0); }
+    Kokkos::deep_copy(m_scalars, 0);
+    m_size = 0;
   }
 
   KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
@@ -393,9 +394,9 @@ class UnorderedMap {
   ///
   /// This method has undefined behavior when erasable() is true.
   ///
-  /// Note that this is not a device function; it cannot be called in
+  /// Note that this is <i>not</i> a device function; it cannot be called in
   /// a parallel kernel.  The value is not stored as a variable; it
-  /// must be computed.
+  /// must be computed. m_size is a mutable cache of that value.
   size_type size() const {
     if (capacity() == 0u) return 0u;
     if (modified()) {
@@ -419,9 +420,13 @@ class UnorderedMap {
   bool begin_erase() {
     bool result = !erasable();
     if (is_insertable_map && result) {
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::begin_erase: fence before setting erasable "
+          "flag");
       set_flag(erasable_idx);
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::begin_erase: fence after setting erasable "
+          "flag");
     }
     return result;
   }
@@ -429,10 +434,12 @@ class UnorderedMap {
   bool end_erase() {
     bool result = erasable();
     if (is_insertable_map && result) {
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::end_erase: fence before erasing");
       Impl::UnorderedMapErase<declared_map_type> f(*this);
       f.apply();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::UnorderedMap::end_erase: fence after erasing");
       reset_flag(erasable_idx);
     }
     return result;
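Per the updated documentation above, UnorderedMap::size() is host-only: it recomputes the element count when the map has been modified and caches it in m_size, which clear() now also resets. A hedged usage sketch, assuming a standard Kokkos build; the capacity hint, key range, and values are illustrative.

// Insert on the device, then query the size from the host.
#include <Kokkos_Core.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::UnorderedMap<int, double> map(10000);  // capacity hint
    Kokkos::parallel_for(
        "fill_map", 1000,
        KOKKOS_LAMBDA(const int i) { map.insert(i, 1.0 * i); });
    Kokkos::fence("example: fence before querying the map size");
    // Host-only query; recomputed because the map was modified.
    std::printf("map size = %u\n", map.size());
    map.clear();  // resets contents and the cached size
  }
  Kokkos::finalize();
  return 0;
}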
diff --git a/packages/kokkos/containers/src/Kokkos_Vector.hpp b/packages/kokkos/containers/src/Kokkos_Vector.hpp
index a1fbba6b21c76b4bb7b2a63a4e3a863241a7cd74..88721bd89eb2fd86543c480727876a58fd888a56 100644
--- a/packages/kokkos/containers/src/Kokkos_Vector.hpp
+++ b/packages/kokkos/containers/src/Kokkos_Vector.hpp
@@ -119,12 +119,14 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
     if (DV::template need_sync<typename DV::t_dev::device_type>()) {
       set_functor_host f(DV::h_view, val);
       parallel_for("Kokkos::vector::assign", n, f);
-      typename DV::t_host::execution_space().fence();
+      typename DV::t_host::execution_space().fence(
+          "Kokkos::vector::assign: fence after assigning values");
       DV::template modify<typename DV::t_host::device_type>();
     } else {
       set_functor f(DV::d_view, val);
       parallel_for("Kokkos::vector::assign", n, f);
-      typename DV::t_dev::execution_space().fence();
+      typename DV::t_dev::execution_space().fence(
+          "Kokkos::vector::assign: fence after assigning values");
       DV::template modify<typename DV::t_dev::device_type>();
     }
   }
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
index 6047e60f3dd080b8cfe456627ccc80266e7df66b..9512f2d4a20e509af321d315c8963693076a0d58 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -57,22 +57,10 @@
 namespace Kokkos {
 namespace Impl {
 
-KOKKOS_FORCEINLINE_FUNCTION
-unsigned rotate_left(unsigned i, int r) {
-  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
-  return r ? ((i << r) | (i >> (size - r))) : i;
-}
-
 KOKKOS_FORCEINLINE_FUNCTION
 unsigned rotate_right(unsigned i, int r) {
   constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
-  // FIXME_SYCL llvm.fshr.i32 missing
-  // (https://github.com/intel/llvm/issues/3308)
-#ifdef __SYCL_DEVICE_ONLY__
-  return rotate_left(i, size - r);
-#else
   return r ? ((i >> r) | (i << (size - r))) : i;
-#endif
 }
 
 template <typename Bitset>
diff --git a/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
index 367ab338572064f167c3c50f447e4d27efff6999..fdd78e4e5f99dc4748c093d274c1e62f9316261a 100644
--- a/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
+++ b/packages/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@@ -75,7 +75,7 @@ uint32_t fmix32(uint32_t h) {
 
 KOKKOS_INLINE_FUNCTION
 uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) {
-  const uint8_t* data = (const uint8_t*)key;
+  const uint8_t* data = static_cast<const uint8_t*>(key);
   const int nblocks   = len / 4;
 
   uint32_t h1 = seed;
diff --git a/packages/kokkos/containers/unit_tests/TestDualView.hpp b/packages/kokkos/containers/unit_tests/TestDualView.hpp
index 3eee85ed10bd81bc8b511afa9f0fbde7ba244b8f..e22564aa5c24e569ae98d972fe5526a35cc741a6 100644
--- a/packages/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDualView.hpp
@@ -49,7 +49,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_DualView.hpp>
 
 namespace Test {
diff --git a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
index dd0199ed81c75dcee42b964ac0bb1c246175ed01..a8d62bd24cad46531f2b4814f4d832c08758fe10 100644
--- a/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -702,6 +702,11 @@ class TestDynViewAPI {
 
   using View0 = Kokkos::View<T, device>;
   using View1 = Kokkos::View<T*, device>;
+  using View2 = Kokkos::View<T**, device>;
+  using View3 = Kokkos::View<T***, device>;
+  using View4 = Kokkos::View<T****, device>;
+  using View5 = Kokkos::View<T*****, device>;
+  using View6 = Kokkos::View<T******, device>;
   using View7 = Kokkos::View<T*******, device>;
 
   using host_view_space = typename View0::host_mirror_space;
@@ -1065,7 +1070,7 @@ class TestDynViewAPI {
 
     dView0 d_uninitialized(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20);
-    ASSERT_TRUE(d_uninitialized.data() != nullptr);
+    ASSERT_NE(d_uninitialized.data(), nullptr);
     ASSERT_EQ(d_uninitialized.rank(), 2);
     ASSERT_EQ(d_uninitialized.extent(0), 10);
     ASSERT_EQ(d_uninitialized.extent(1), 20);
@@ -1075,14 +1080,14 @@ class TestDynViewAPI {
     hView0 hx, hy, hz;
 
     ASSERT_TRUE(Kokkos::is_dyn_rank_view<dView0>::value);
-    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double> >::value);
-
-    ASSERT_TRUE(dx.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dy.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dz.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(hx.data() == nullptr);
-    ASSERT_TRUE(hy.data() == nullptr);
-    ASSERT_TRUE(hz.data() == nullptr);
+    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double>>::value);
+
+    ASSERT_EQ(dx.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dy.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dz.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(hx.data(), nullptr);
+    ASSERT_EQ(hy.data(), nullptr);
+    ASSERT_EQ(hz.data(), nullptr);
     ASSERT_EQ(dx.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dy.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dz.extent(0), 0u);  // Okay with UVM
@@ -1153,11 +1158,11 @@ class TestDynViewAPI {
 
     ASSERT_EQ(dx.use_count(), size_t(2));
 
-    ASSERT_FALSE(dx.data() == nullptr);
-    ASSERT_FALSE(const_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
+    ASSERT_NE(dx.data(), nullptr);
+    ASSERT_NE(const_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
     ASSERT_NE(dx, dy);
 
     ASSERT_EQ(dx.extent(0), unsigned(N0));
@@ -1317,17 +1322,17 @@ class TestDynViewAPI {
     ASSERT_NE(dx, dz);
 
     dx = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
     dy = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
     dz = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
 
     // View - DynRankView Interoperability tests
     // deep_copy from view to dynrankview
@@ -1367,7 +1372,7 @@ class TestDynViewAPI {
   static void check_auto_conversion_to_const(
       const Kokkos::DynRankView<const DataType, device>& arg_const,
       const Kokkos::DynRankView<DataType, device>& arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_allocated() {
@@ -1396,8 +1401,8 @@ class TestDynViewAPI {
     const_typeX xc = x;
     const_typeR xr = x;
 
-    ASSERT_TRUE(xc == x);
-    ASSERT_TRUE(x == xc);
+    ASSERT_EQ(xc, x);
+    ASSERT_EQ(x, xc);
 
     // For CUDA the constant random access View does not return
     // an lvalue reference due to retrieving through texture cache
@@ -1406,7 +1411,7 @@ class TestDynViewAPI {
     if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value)
 #endif
     {
-      ASSERT_TRUE(x.data() == xr.data());
+      ASSERT_EQ(x.data(), xr.data());
     }
 
     // typeX xf = xc ; // setting non-const from const must not compile
@@ -1659,29 +1664,29 @@ class TestDynViewAPI {
     const_svector_right_type cvr3 =
         Kokkos::subdynrankview(mv, Kokkos::ALL(), 2);
 
-    ASSERT_TRUE(&v1[0] == &v1(0));
-    ASSERT_TRUE(&v1[0] == &mv(0, 0));
-    ASSERT_TRUE(&v2[0] == &mv(0, 1));
-    ASSERT_TRUE(&v3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cv1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cv2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cv3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&vr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&vr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&vr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cvr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cvr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cvr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2));
-    ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3));
-    ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4));
-    ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2));
-    ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3));
-    ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4));
+    ASSERT_EQ(&v1[0], &v1(0));
+    ASSERT_EQ(&v1[0], &mv(0, 0));
+    ASSERT_EQ(&v2[0], &mv(0, 1));
+    ASSERT_EQ(&v3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cv1[0], &mv(0, 0));
+    ASSERT_EQ(&cv2[0], &mv(0, 1));
+    ASSERT_EQ(&cv3[0], &mv(0, 2));
+
+    ASSERT_EQ(&vr1[0], &mv(0, 0));
+    ASSERT_EQ(&vr2[0], &mv(0, 1));
+    ASSERT_EQ(&vr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cvr1[0], &mv(0, 0));
+    ASSERT_EQ(&cvr2[0], &mv(0, 1));
+    ASSERT_EQ(&cvr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&mv1(0, 0), &mv(1, 2));
+    ASSERT_EQ(&mv1(1, 1), &mv(2, 3));
+    ASSERT_EQ(&mv1(3, 2), &mv(4, 4));
+    ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2));
+    ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3));
+    ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4));
 
     const_svector_type c_cv1(v1);
     typename svector_type::const_type c_cv2(v2);
diff --git a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
index f018793dd6f3b162acbf9db20174c47ac75fc1c0..023bf92f62b48bc46878209e6c5ef6eccedeb726 100644
--- a/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -52,7 +52,7 @@
 #include <Kokkos_Core.hpp>
 
 #include <Kokkos_DynamicView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace Test {
 
diff --git a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
index 9ddc226e291f6e7dc7d6bc960fad470fafeb9974..24a43e1ebc72820dbd84dd6e2931837cabfaecba 100644
--- a/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestOffsetView.hpp
@@ -50,7 +50,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <cstdio>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <Kokkos_OffsetView.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
 
diff --git a/packages/kokkos/containers/unit_tests/TestScatterView.hpp b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
index fdbce2d492009cf38d5491398d77423108edc6a5..342ce2af48afe2cba3737db653f67957d04a51d4 100644
--- a/packages/kokkos/containers/unit_tests/TestScatterView.hpp
+++ b/packages/kokkos/containers/unit_tests/TestScatterView.hpp
@@ -118,11 +118,51 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
       scatter_access(k, 3)++;
       scatter_access(k, 4)--;
       scatter_access(k, 5) -= 5;
+// Work around an Intel 17 compiler bug that sometimes adds random
+// instruction alignment, which makes the lock instruction
+// illegal. It seems to affect mostly unsigned int atomics.
+// Looking at the assembly, the compiler
+// appears to insert cache-line alignment for the instruction.
+// The issue is not restricted to specific archs; it has been seen on SNB
+// and SKX, but for different code. Another occurrence was with Desul
+// atomics in a different unit test; this one here happens without Desul
+// atomics. Inserting an assembly nop instruction changes the alignment
+// and works around the problem.
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 6) += 2;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 7)++;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access_atomic(k, 8)--;
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       --scatter_access_atomic(k, 9);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       ++scatter_access_atomic(k, 10);
+#ifdef KOKKOS_COMPILER_INTEL
+#if (KOKKOS_COMPILER_INTEL < 1800)
+      asm volatile("nop\n");
+#endif
+#endif
       scatter_access(k, 11) -= 3;
     }
   }
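The workaround above repeats the same five-line preprocessor guard before each atomic update. As an illustration only (the macro name below is hypothetical and not taken from the Kokkos sources), the guard could be centralized so each call site stays a single line:

// Hypothetical helper built on the same KOKKOS_COMPILER_INTEL version check
// used in the hunk above; expands to nothing for other compilers.
#if defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 1800)
#define TEST_SCATTER_INTEL17_ALIGNMENT_NOP() asm volatile("nop\n")
#else
#define TEST_SCATTER_INTEL17_ALIGNMENT_NOP() ((void)0)
#endif

// Usage inside the scatter functor body:
//   TEST_SCATTER_INTEL17_ALIGNMENT_NOP();
//   scatter_access_atomic(k, 6) += 2;
//   TEST_SCATTER_INTEL17_ALIGNMENT_NOP();
//   scatter_access_atomic(k, 7)++;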
diff --git a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
index a9a178f95e7b7fedabcb7b00b292d88603ff3f77..c9a3eed90c372fcd4211d0a46868fe8bcc061614 100644
--- a/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
+++ b/packages/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@@ -180,8 +180,6 @@ void run_test_graph3(size_t B, size_t N) {
 
   std::vector<size_t> sizes(LENGTH);
 
-  size_t total_length = 0;
-
   for (size_t i = 0; i < LENGTH; ++i) {
     sizes[i] = rand() % 1000;
   }
@@ -189,10 +187,6 @@ void run_test_graph3(size_t B, size_t N) {
   sizes[1]    = N;
   sizes[1998] = N;
 
-  for (size_t i = 0; i < LENGTH; ++i) {
-    total_length += sizes[i];
-  }
-
   int C    = 0;
   dView dx = Kokkos::create_staticcrsgraph<dView>("test", sizes);
   dx.create_block_partitioning(B, C);
diff --git a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
index 4413cfbc80e31271d1e2b830976796ade24aaa9a..8009b996566322147bcd5cfe257dd858b72819bb 100644
--- a/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
+++ b/packages/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -295,10 +295,8 @@ void test_deep_copy(uint32_t num_nodes) {
 }
 
 // FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs
-// FIXME_HIP
 // WORKAROUND MSVC
-#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \
-    !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
+#if !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
 TEST(TEST_CATEGORY, UnorderedMap_insert) {
   for (int i = 0; i < 500; ++i) {
     test_insert<TEST_EXECSPACE>(100000, 90000, 100, true);
@@ -329,6 +327,23 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
   ASSERT_TRUE(n.is_allocated());
 }
 
+TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) {
+  using Map =
+      Kokkos::UnorderedMap<int, void, Kokkos::DefaultHostExecutionSpace>;
+
+  Map m(11);
+  ASSERT_EQ(0u, m.size());
+
+  m.insert(2);
+  m.insert(3);
+  m.insert(5);
+  m.insert(7);
+  ASSERT_EQ(4u, m.size());
+
+  m.clear();
+  ASSERT_EQ(0u, m.size());
+}
+
 }  // namespace Test
 
 #endif  // KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/packages/kokkos/core/cmake/KokkosCore_config.h.in b/packages/kokkos/core/cmake/KokkosCore_config.h.in
deleted file mode 100644
index f0835772b864faf0126796c75f5f1e9d02f95e28..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/cmake/KokkosCore_config.h.in
+++ /dev/null
@@ -1,104 +0,0 @@
-/* The trivial 'src/build_common.sh' creates a config
- * that must stay in sync with this file.
- */
-#cmakedefine KOKKOS_FOR_SIERRA
-
-#if !defined(KOKKOS_FOR_SIERRA)
-
-#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
-#error \
-    "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
-#else
-#define KOKKOS_CORE_CONFIG_H
-#endif
-
-#cmakedefine KOKKOS_ENABLE_CUDA
-#cmakedefine KOKKOS_ENABLE_HIP
-#cmakedefine KOKKOS_ENABLE_OPENMP
-#cmakedefine KOKKOS_ENABLE_THREADS
-#cmakedefine KOKKOS_ENABLE_SERIAL
-#cmakedefine KOKKOS_ENABLE_Winthread
-
-#cmakedefine KOKKOS_ENABLE_HWLOC
-#cmakedefine KOKKOS_ENABLE_HBWSPACE
-#cmakedefine KOKKOS_ENABLE_LIBRT
-
-#cmakedefine KOKKOS_ENABLE_DEBUG
-#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
-#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
-#cmakedefine KOKKOS_ENABLE_TUNING
-
-#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
-
-#ifdef KOKKOS_ENABLE_CUDA
-
-#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
-
-// mfh 16 Sep 2014: If passed in on the command line, that overrides
-// any value of KOKKOS_USE_CUDA_UVM here.  Doing this should prevent build
-// warnings like this one:
-//
-// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning:
-// "KOKKOS_USE_CUDA_UVM" redefined
-//
-// At some point, we should edit the test-build scripts in
-// Trilinos/cmake/ctest/drivers/perseus/, and take
-// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there.  I
-// hesitate to do that now, because I'm not sure if all the files are
-// including KokkosCore_config.h (or a header file that includes it) like
-// they should.
-#ifndef KOKKOS_USE_CUDA_UVM
-#cmakedefine KOKKOS_USE_CUDA_UVM
-#endif
-
-#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-
-#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
-
-#endif
-
-#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
-
-#ifndef __CUDA_ARCH__
-#cmakedefine KOKKOS_ENABLE_ISA_X86_64
-#cmakedefine KOKKOS_ENABLE_ISA_KNC
-#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE
-#endif
-
-#ifdef KOKKOS_ENABLE_HIP
-#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
-#endif
-
-#cmakedefine KOKKOS_ARCH_ARMV80 1
-#cmakedefine KOKKOS_ARCH_ARMV81 1
-#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1
-#cmakedefine KOKKOS_ARCH_AVX 1
-#cmakedefine KOKKOS_ARCH_AVX2 1
-#cmakedefine KOKKOS_ARCH_AVX512MIC 1
-#cmakedefine KOKKOS_ARCH_AVX512XEON 1
-#cmakedefine KOKKOS_ARCH_KNC 1
-#cmakedefine KOKKOS_ARCH_POWER8 1
-#cmakedefine KOKKOS_ARCH_POWER9 1
-#cmakedefine KOKKOS_ARCH_KEPLER 1
-#cmakedefine KOKKOS_ARCH_KEPLER30 1
-#cmakedefine KOKKOS_ARCH_KEPLER32 1
-#cmakedefine KOKKOS_ARCH_KEPLER35 1
-#cmakedefine KOKKOS_ARCH_KEPLER37 1
-#cmakedefine KOKKOS_ARCH_MAXWELL 1
-#cmakedefine KOKKOS_ARCH_MAXWELL50 1
-#cmakedefine KOKKOS_ARCH_MAXWELL52 1
-#cmakedefine KOKKOS_ARCH_MAXWELL53 1
-#cmakedefine KOKKOS_ARCH_PASCAL 1
-#cmakedefine KOKKOS_ARCH_PASCAL60 1
-#cmakedefine KOKKOS_ARCH_PASCAL61 1
-#cmakedefine KOKKOS_ARCH_VOLTA70 1
-
-// TODO: These are currently not used in Kokkos.  Should they be removed?
-#cmakedefine KOKKOS_ENABLE_MPI
-#cmakedefine KOKKOS_ENABLE_CUSPARSE
-
-// TODO: No longer options in Kokkos.  Need to be removed.
-#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
-
-#endif  // !defined(KOKKOS_FOR_SIERRA)
diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt
index 9ff4b6006da8cb0358f2a9e53810b79ce59e8b02..a7c57a94346d74db97c8c320e0f3669bb2cc68cc 100644
--- a/packages/kokkos/core/perf_test/CMakeLists.txt
+++ b/packages/kokkos/core/perf_test/CMakeLists.txt
@@ -10,9 +10,7 @@
 #INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
 
 # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
-IF (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI
-         OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   RETURN()
 ENDIF()
 
diff --git a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
index dee21fd7a575bd5aa0f6838980c670510f475cab..b534c32c52c691f4c65c5442d89a9381516391f1 100644
--- a/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -231,7 +231,7 @@ void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << ", " << avg_seconds << std::endl;
   }
 }
 
diff --git a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
index c431c2b0c86d30192edc63d7dfbc447887f227cf..24c1898e0a16a4149c21df08e82c3da54ef0a25a 100644
--- a/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
+++ b/packages/kokkos/core/perf_test/PerfTestHexGrad.cpp
@@ -280,7 +280,7 @@ void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_hexgrad << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << ", " << avg_seconds << std::endl;
   }
 }
 
diff --git a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
index 50bbc78a6b75815ad59ea73c0077dc27ae2dccfa..5b7c2a7a03907f8f0c854482c06ef155441c097d 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
@@ -205,7 +205,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -238,7 +238,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -280,7 +280,7 @@ TEST(default_exec, overlap_range_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
@@ -378,7 +378,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -413,7 +413,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -459,7 +459,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
@@ -548,7 +548,7 @@ TEST(default_exec, overlap_team_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -581,7 +581,7 @@ TEST(default_exec, overlap_team_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
   timer.reset();
   Kokkos::parallel_reduce(
@@ -622,7 +622,7 @@ TEST(default_exec, overlap_team_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
index 550316bec997121a58a8b44f6df8efdced16a623..555a05ea279cd2280c510a03976dd75e8ee171f2 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp
@@ -120,7 +120,8 @@ void run_allocateview_tests(int N, int R) {
   {
     Kokkos::Timer timer;
     for (int r = 0; r < R; r++) {
-      double* a_ptr = (double*)Kokkos::kokkos_malloc("A", sizeof(double) * N8);
+      double* a_ptr =
+          static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
       Kokkos::parallel_for(
           N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
       Kokkos::fence();
diff --git a/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
index afeeb643569ecd1a981132bed08944288ec3ca72..b0562f2fd12227f1e3270c332cd3f1a6c4298fad 100644
--- a/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
+++ b/packages/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp
@@ -47,10 +47,18 @@
 namespace Test {
 
 TEST(default_exec, ViewResize_Rank8) {
+// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI
+#ifdef KOKKOS_ENABLE_SYCL
+  printf("Resize View Performance for LayoutLeft:\n");
+  run_resizeview_tests8<Kokkos::LayoutLeft>(9, 1);
+  printf("Resize View Performance for LayoutRight:\n");
+  run_resizeview_tests8<Kokkos::LayoutRight>(9, 1);
+#else
   printf("Resize View Performance for LayoutLeft:\n");
   run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1);
   printf("Resize View Performance for LayoutRight:\n");
   run_resizeview_tests8<Kokkos::LayoutRight>(10, 1);
+#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/perf_test/test_atomic.cpp b/packages/kokkos/core/perf_test/test_atomic.cpp
index 59820f3bdd2e83291dfd524325d3b7be6ba918ef..54824e5b39b91456a81e541e39a441d20b2879a7 100644
--- a/packages/kokkos/core/perf_test/test_atomic.cpp
+++ b/packages/kokkos/core/perf_test/test_atomic.cpp
@@ -47,7 +47,7 @@
 #include <cstdlib>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
@@ -401,7 +401,7 @@ template <class T>
 void Loop(int loop, int test, const char* type_name) {
   LoopVariant<T>(loop, test);
 
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   T res       = LoopVariant<T>(loop, test);
   double time = timer.seconds();
 
diff --git a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
index eec1c8eacc7779121fd54e23d1ad7e4efa80902c..4086ef58163ff4ec144d2c151241952752d3453d 100644
--- a/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
+++ b/packages/kokkos/core/perf_test/test_atomic_minmax_simple.cpp
@@ -12,13 +12,13 @@
 #include <typeinfo>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using exec_space = Kokkos::DefaultExecutionSpace;
 
 template <typename T>
 void test(const int length) {
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   using vector = Kokkos::View<T*, exec_space>;
 
diff --git a/packages/kokkos/core/perf_test/test_mempool.cpp b/packages/kokkos/core/perf_test/test_mempool.cpp
index 9aab119774c49d99ec112c527c79364a9c02ddc6..7887d4ba55196ddbc283baa023884e2f43a48991 100644
--- a/packages/kokkos/core/perf_test/test_mempool.cpp
+++ b/packages/kokkos/core/perf_test/test_mempool.cpp
@@ -48,7 +48,7 @@
 #include <limits>
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using ExecSpace   = Kokkos::DefaultExecutionSpace;
 using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
@@ -100,7 +100,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+      ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
       if (ptrs(j)) ++update;
     }
@@ -129,7 +129,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      pool.deallocate((void*)ptrs(j), size_alloc);
+      pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
     }
   }
 
@@ -153,9 +153,9 @@ struct TestFunctor {
         for (unsigned k = 0; k < repeat_inner; ++k) {
           const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-          pool.deallocate((void*)ptrs(j), size_alloc);
+          pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
 
-          ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+          ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
           if (0 == ptrs(j)) update++;
         }
@@ -266,7 +266,7 @@ int main(int argc, char* argv[]) {
     TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                         fill_stride, chunk_span, repeat_inner);
 
-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
 
     if (!functor.test_fill()) {
       Kokkos::abort("fill ");
diff --git a/packages/kokkos/core/perf_test/test_taskdag.cpp b/packages/kokkos/core/perf_test/test_taskdag.cpp
index b2f936a955eca4a6d8a3c0eec928e01c5de66e51..49957ae9323db825ebcfa6d3c19dddb38856862d 100644
--- a/packages/kokkos/core/perf_test/test_taskdag.cpp
+++ b/packages/kokkos/core/perf_test/test_taskdag.cpp
@@ -56,7 +56,7 @@ int main() { return 0; }
 #include <cstdlib>
 #include <limits>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 using ExecSpace = Kokkos::DefaultExecutionSpace;
 
@@ -220,7 +220,7 @@ int main(int argc, char* argv[]) {
     double time_sum = 0;
 
     for (int i = 0; i < test_repeat_outer; ++i) {
-      Kokkos::Impl::Timer timer;
+      Kokkos::Timer timer;
 
       Functor::FutureType ftmp =
           Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input));
diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt
index 2ab0989805723ce32115d379dd39708b5edd8209..499736c60d55b7746682f8828a9af45fc6c0aa8b 100644
--- a/packages/kokkos/core/src/CMakeLists.txt
+++ b/packages/kokkos/core/src/CMakeLists.txt
@@ -9,6 +9,8 @@ INSTALL (DIRECTORY
   "${CMAKE_CURRENT_SOURCE_DIR}/"
   DESTINATION ${KOKKOS_HEADER_DIR}
   FILES_MATCHING
+  PATTERN "*.inc"
+  PATTERN "*.inc_*"
   PATTERN "*.hpp"
   PATTERN "*.h"
 )
@@ -65,6 +67,15 @@ IF (KOKKOS_ENABLE_SYCL)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp)
 ENDIF()
 
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS)
+  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/desul/src/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.inc)
+ENDIF()
+
+
 KOKKOS_ADD_LIBRARY(
   kokkoscore
   SOURCES ${KOKKOS_CORE_SRCS}
@@ -86,3 +97,15 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)
+
+# FIXME: We need a proper solution to figure out whether to enable
+#        libatomic
+# XL requires libatomic even for 64-bit CAS; most others need it only for 128-bit.
+# I (CT) had removed 128-bit CAS from desul so that libatomic is not needed.
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS AND
+    (KOKKOS_ENABLE_OPENMPTARGET OR (CMAKE_CXX_COMPILER_ID STREQUAL XLClang)))
+  target_link_libraries(kokkoscore PUBLIC atomic)
+ENDIF()
+
+
+KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH)
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 916f109758de4ba3cf469659d7458ae77cf464da..f6b276240316ec1e6edc332d680eb72853c980b1 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -90,43 +90,25 @@ static std::atomic<int> num_uvm_allocations(0);
 
 }  // namespace
 
-DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
+void DeepCopyCuda(void *dst, const void *src, size_t n) {
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
 }
 
-DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
-}
-
-DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src,
-                                               size_t n) {
-  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
-}
-
-DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
-}
-
-DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
-      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
-}
-
-DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
-                                               const void *src, size_t n) {
-  CUDA_SAFE_CALL(
+void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src,
+                       size_t n) {
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
       cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
 }
 
 void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
   cudaStream_t s = cuda_get_deep_copy_stream();
-  CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
-  cudaStreamSynchronize(s);
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
+  Impl::cuda_stream_synchronize(
+      s,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          DeepCopyResourceSynchronization,
+      "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync");
 }
 
 }  // namespace Impl
@@ -137,6 +119,7 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
 
 namespace Kokkos {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 KOKKOS_DEPRECATED void CudaSpace::access_error() {
   const std::string msg(
       "Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
@@ -150,6 +133,7 @@ KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) {
       "non-Cuda space");
   Kokkos::Impl::throw_runtime_exception(msg);
 }
+#endif
 
 /*--------------------------------------------------------------------------*/
 
@@ -164,9 +148,11 @@ bool CudaUVMSpace::available() {
 
 /*--------------------------------------------------------------------------*/
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 int CudaUVMSpace::number_of_allocations() {
   return Kokkos::Impl::num_uvm_allocations.load();
 }
+#endif
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
 // The purpose of the following variable is to allow a state-based choice
 // for pinning UVM allocations to the CPU. For now this is considered
@@ -204,6 +190,8 @@ CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}
 
 CudaHostPinnedSpace::CudaHostPinnedSpace() {}
 
+int memory_threshold_g = 40000;  // 40 kB
+
 //==============================================================================
 // <editor-fold desc="allocate()"> {{{1
 
@@ -221,7 +209,19 @@ void *CudaSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void *ptr = nullptr;
 
+#ifndef CUDART_VERSION
+#error CUDART_VERSION undefined!
+#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
+  cudaError_t error_code;
+  if (arg_alloc_size >= memory_threshold_g) {
+    error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  } else {
+    error_code = cudaMalloc(&ptr, arg_alloc_size);
+  }
+#else
   auto error_code = cudaMalloc(&ptr, arg_alloc_size);
+#endif
   if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
     cudaGetLastError();  // This is the only way to clear the last error, which
                          // we should do here since we're turning it into an
@@ -253,7 +253,8 @@ void *CudaUVMSpace::impl_allocate(
     const Kokkos::Tools::SpaceHandle arg_handle) const {
   void *ptr = nullptr;
 
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation");
   if (arg_alloc_size > 0) {
     Kokkos::Impl::num_uvm_allocations++;
 
@@ -276,7 +277,8 @@ void *CudaUVMSpace::impl_allocate(
               CudaMallocManaged);
     }
   }
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation");
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     const size_t reported_size =
         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@@ -337,9 +339,20 @@ void CudaSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-
   try {
-    CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#ifndef CUDART_VERSION
+#error CUDART_VERSION undefined!
+#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
+    if (arg_alloc_size >= memory_threshold_g) {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    } else {
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+    }
+#else
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+#endif
   } catch (...) {
   }
 }
@@ -362,7 +375,8 @@ void CudaUVMSpace::impl_deallocate(
     ,
     const size_t arg_logical_size,
     const Kokkos::Tools::SpaceHandle arg_handle) const {
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation");
   if (Kokkos::Profiling::profileLibraryLoaded()) {
     const size_t reported_size =
         (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@@ -372,11 +386,12 @@ void CudaUVMSpace::impl_deallocate(
   try {
     if (arg_alloc_ptr != nullptr) {
       Kokkos::Impl::num_uvm_allocations--;
-      CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
     }
   } catch (...) {
   }
-  Cuda::impl_static_fence();
+  Cuda::impl_static_fence(
+      "Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation");
 }
 
 void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
@@ -401,7 +416,7 @@ void CudaHostPinnedSpace::impl_deallocate(
                                       reported_size);
   }
   try {
-    CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
   } catch (...) {
   }
 }
@@ -462,7 +477,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
   resDesc.res.linear.sizeInBytes = alloc_size;
   resDesc.res.linear.devPtr      = alloc_ptr;
 
-  CUDA_SAFE_CALL(
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
       cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr));
 
   return tex_obj;
@@ -581,7 +596,7 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
                            bool to_device) {
   if ((ptr == nullptr) || (bytes == 0)) return;
   cudaPointerAttributes attr;
-  CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
   // I measured this and it turns out prefetching towards the host slows
   // DualView syncs down. Probably because the latency is not too bad in the
   // first place for the pull down. If we want to change that provde
@@ -593,8 +608,8 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
 #endif
   if (to_device && is_managed &&
       space.cuda_device_prop().concurrentManagedAccess) {
-    CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(),
-                                        space.cuda_stream()));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync(
+        ptr, bytes, space.cuda_device(), space.cuda_stream()));
   }
 }
 
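For readers following the CudaSpace::impl_allocate / impl_deallocate changes above, here is a minimal standalone sketch of the size-threshold pattern they introduce, assuming CUDA 11.2+ for the stream-ordered allocator; error checking and the profiling hooks are omitted, and the function name is illustrative only:

#include <cuda_runtime.h>
#include <cstddef>

void* allocate_device_memory(std::size_t bytes) {
  void* ptr = nullptr;
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11020)
  constexpr std::size_t threshold = 40000;  // ~40 kB, mirroring memory_threshold_g
  if (bytes >= threshold) {
    // Large allocations use the stream-ordered allocator on the default
    // stream; the synchronize makes the pointer usable from any stream.
    cudaMallocAsync(&ptr, bytes, /*stream=*/0);
    cudaDeviceSynchronize();
  } else {
    // Small allocations keep the classic path.
    cudaMalloc(&ptr, bytes);
  }
#else
  cudaMalloc(&ptr, bytes);
#endif
  return ptr;
}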
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
index 0f4259072d97f26c0032e674bdf60b9031fcee11..993c8d1bbadc4ebff2fcc9bdc905fca6bb37a9cf 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@@ -134,7 +134,12 @@ inline int cuda_deduce_block_size(bool early_termination,
     }
 
     if (blocks_per_sm >= min_blocks_per_sm) {
-      if (threads_per_sm >= opt_threads_per_sm) {
+      // The logic prefers smaller block sizes over larger ones to
+      // give more flexibility to the scheduler.
+      // But don't go below 128, where performance suffers significantly
+      // for simple copy/set kernels.
+      if ((threads_per_sm > opt_threads_per_sm) ||
+          ((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) {
         opt_block_size     = block_size;
         opt_threads_per_sm = threads_per_sm;
       }
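The comment added above describes a tie-breaking rule; the following self-contained sketch (with made-up occupancy numbers, the real ones come from the CUDA occupancy API) shows how that rule picks the smallest block size with maximal threads per SM that is still at least 128:

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // {block_size, threads_per_sm} candidates, visited here from the largest
  // block size down, so ties go to the later, smaller candidate, matching
  // the preference described in the comment above.
  const std::vector<std::pair<int, int>> candidates = {
      {1024, 1024}, {512, 1536}, {256, 1536}, {128, 1536}, {64, 1536}};

  int opt_block_size     = 0;
  int opt_threads_per_sm = 0;
  for (const auto& [block_size, threads_per_sm] : candidates) {
    if ((threads_per_sm > opt_threads_per_sm) ||
        ((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) {
      opt_block_size     = block_size;
      opt_threads_per_sm = threads_per_sm;
    }
  }
  // Prints 128: ties on occupancy go to the smaller block size, but the
  // 64-thread candidate is rejected by the >= 128 floor.
  std::printf("chosen block size: %d\n", opt_block_size);
  return 0;
}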
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
index 4759001d81f99afc0a1e2aa6cf64462d9e7fcdc9..36df0d2564ae8ab86849837cf60cb6d93727aab2 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@@ -49,13 +49,19 @@
 #ifdef KOKKOS_ENABLE_CUDA
 
 #include <impl/Kokkos_Error.hpp>
-
+#include <impl/Kokkos_Profiling.hpp>
 #include <iosfwd>
 
 namespace Kokkos {
 namespace Impl {
 
-void cuda_device_synchronize();
+void cuda_stream_synchronize(
+    const cudaStream_t stream,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const std::string& name);
+void cuda_device_synchronize(const std::string& name);
+void cuda_stream_synchronize(const cudaStream_t stream,
+                             const std::string& name);
 
 void cuda_internal_error_throw(cudaError e, const char* name,
                                const char* file = nullptr, const int line = 0);
@@ -68,9 +74,24 @@ inline void cuda_internal_safe_call(cudaError e, const char* name,
   }
 }
 
-#define CUDA_SAFE_CALL(call) \
+#define KOKKOS_IMPL_CUDA_SAFE_CALL(call) \
   Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__)
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
+KOKKOS_DEPRECATED
+inline void cuda_internal_safe_call_deprecated(cudaError e, const char* name,
+                                               const char* file = nullptr,
+                                               const int line   = 0) {
+  cuda_internal_safe_call(e, name, file, line);
+}
+
+#define CUDA_SAFE_CALL(call)                                              \
+  Kokkos::Impl::cuda_internal_safe_call_deprecated(call, #call, __FILE__, \
+                                                   __LINE__)
+
+#endif
+
 }  // namespace Impl
 
 namespace Experimental {
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
index 3de7a69916130de41077bae684df0cbc87daea4b..bd514f5e88d915b46eccc6ccd5da7baa311088e3 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp
@@ -60,6 +60,7 @@
 
 #include <Kokkos_Cuda.hpp>
 #include <cuda_runtime_api.h>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
 
 namespace Kokkos {
 namespace Impl {
@@ -82,8 +83,8 @@ struct GraphImpl<Kokkos::Cuda> {
     constexpr size_t error_log_size = 256;
     cudaGraphNode_t error_node      = nullptr;
     char error_log[error_log_size];
-    CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node,
-                                        error_log, error_log_size));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate(
+        &m_graph_exec, m_graph, &error_node, error_log, error_log_size));
     // TODO @graphs print out errors
   }
 
@@ -107,26 +108,27 @@ struct GraphImpl<Kokkos::Cuda> {
     // TODO @graphs we need to somehow indicate the need for a fence in the
     //              destructor of the GraphImpl object (so that we don't have to
     //              just always do it)
-    m_execution_space.fence();
+    m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
     KOKKOS_EXPECTS(bool(m_graph))
     if (bool(m_graph_exec)) {
-      CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
     }
-    CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
   };
 
   explicit GraphImpl(Kokkos::Cuda arg_instance)
       : m_execution_space(std::move(arg_instance)) {
-    CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
   }
 
   void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
     // All of the predecessors are just added as normal, so all we need to
     // do here is add an empty node
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node),
-                                         m_graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
   }
 
   template <class NodeImpl>
@@ -171,7 +173,7 @@ struct GraphImpl<Kokkos::Cuda> {
     auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node;
     KOKKOS_EXPECTS(bool(cuda_node))
 
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
   }
 
@@ -179,7 +181,7 @@ struct GraphImpl<Kokkos::Cuda> {
     if (!bool(m_graph_exec)) {
       _instantiate_graph();
     }
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
   }
 
@@ -192,9 +194,10 @@ struct GraphImpl<Kokkos::Cuda> {
     KOKKOS_EXPECTS(!bool(m_graph_exec))
     auto rv = std::make_shared<root_node_impl_t>(
         get_execution_space(), _graph_node_is_root_ctor_tag{});
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
     KOKKOS_ENSURES(bool(rv->node_details_t::node))
     return rv;
   }
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
index ec9c434fe663900a5d5029896a5c98ce13266605..c81286eb1004b10219b64f38563bc3e8af257ae9 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp
@@ -51,6 +51,9 @@
     !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) ||  \
       defined(KOKKOS_ARCH_MAXWELL52))
 #include <cuda_fp16.h>
+#include <iosfwd>  // istream & ostream for extraction and insertion ops
+#include <string>
+#include <Kokkos_NumericTraits.hpp>  // reduction_identity
 
 #ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
 // Make sure no one else tries to define half_t
@@ -127,7 +130,7 @@ KOKKOS_INLINE_FUNCTION
     std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
         cast_from_half(half_t);
 
-class half_t {
+class alignas(2) half_t {
  public:
   using impl_type = Kokkos::Impl::half_impl_t::type;
 
@@ -138,6 +141,22 @@ class half_t {
   KOKKOS_FUNCTION
   half_t() : val(0.0F) {}
 
+  // Copy constructors
+  KOKKOS_DEFAULTED_FUNCTION
+  half_t(const half_t&) noexcept = default;
+
+  KOKKOS_INLINE_FUNCTION
+  half_t(const volatile half_t& rhs) {
+#ifdef __CUDA_ARCH__
+    val = rhs.val;
+#else
+    const volatile uint16_t* rv_ptr =
+        reinterpret_cast<const volatile uint16_t*>(&rhs.val);
+    const uint16_t rv_val = *rv_ptr;
+    val                   = reinterpret_cast<const impl_type&>(rv_val);
+#endif  // __CUDA_ARCH__
+  }
+
   // Don't support implicit conversion back to impl_type.
   // impl_type is a storage only type on host.
   KOKKOS_FUNCTION
@@ -219,7 +238,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     tmp.val = +tmp.val;
 #else
-    tmp.val   = __float2half(+__half2float(tmp.val));
+    tmp.val               = __float2half(+__half2float(tmp.val));
 #endif
     return tmp;
   }
@@ -230,7 +249,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     tmp.val = -tmp.val;
 #else
-    tmp.val   = __float2half(-__half2float(tmp.val));
+    tmp.val               = __float2half(-__half2float(tmp.val));
 #endif
     return tmp;
   }
@@ -241,7 +260,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     ++val;
 #else
-    float tmp = __half2float(val);
+    float tmp             = __half2float(val);
     ++tmp;
     val       = __float2half(tmp);
 #endif
@@ -255,7 +274,7 @@ class half_t {
 #else
     float tmp = __half2float(val);
     --tmp;
-    val = __float2half(tmp);
+    val     = __float2half(tmp);
 #endif
     return *this;
   }
@@ -290,7 +309,10 @@ class half_t {
 
   template <class T>
   KOKKOS_FUNCTION void operator=(T rhs) volatile {
-    val = cast_to_half(rhs).val;
+    impl_type new_val = cast_to_half(rhs).val;
+    volatile uint16_t* val_ptr =
+        reinterpret_cast<volatile uint16_t*>(const_cast<impl_type*>(&val));
+    *val_ptr = reinterpret_cast<uint16_t&>(new_val);
   }
 
   // Compound operators
@@ -299,30 +321,21 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val += rhs.val;
 #else
-    val = __float2half(__half2float(val) + __half2float(rhs.val));
+    val     = __float2half(__half2float(val) + __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator+=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) + rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) +
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator+=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs += tmp_rhs;
+    *this = tmp_lhs;
   }
 
-  // Compund operators: upcast overloads for +=
+  // Compound operators: upcast overloads for +=
   template <class T>
   KOKKOS_FUNCTION std::enable_if_t<
       std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
@@ -350,27 +363,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val -= rhs.val;
 #else
-    val          = __float2half(__half2float(val) - __half2float(rhs.val));
+    val     = __float2half(__half2float(val) - __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator-=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) - rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) -
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator-=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs -= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compund operators: upcast overloads for -=
@@ -401,27 +405,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val *= rhs.val;
 #else
-    val          = __float2half(__half2float(val) * __half2float(rhs.val));
+    val     = __float2half(__half2float(val) * __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator*=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) * rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) *
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator*=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs *= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compund operators: upcast overloads for *=
@@ -452,27 +447,18 @@ class half_t {
 #ifdef __CUDA_ARCH__
     val /= rhs.val;
 #else
-    val          = __float2half(__half2float(val) / __half2float(rhs.val));
+    val     = __float2half(__half2float(val) / __half2float(rhs.val));
 #endif
     return *this;
   }
 
   KOKKOS_FUNCTION
-  volatile half_t& operator/=(half_t rhs) volatile {
-#ifdef __CUDA_ARCH__
-    // Cuda 10 supports __half volatile stores but not volatile arithmetic
-    // operands. Cast away volatile-ness of val for arithmetic but not for store
-    // location.
-    val = const_cast<impl_type&>(val) / rhs.val;
-#else
-    // Use non-volatile val_ref to suppress:
-    // "warning: implicit dereference will not access object of type ‘volatile
-    // __half’ in statement"
-    auto val_ref = const_cast<impl_type&>(val);
-    val_ref      = __float2half(__half2float(const_cast<impl_type&>(val)) /
-                           __half2float(rhs.val));
-#endif
-    return *this;
+  void operator/=(const volatile half_t& rhs) volatile {
+    half_t tmp_rhs = rhs;
+    half_t tmp_lhs = *this;
+
+    tmp_lhs /= tmp_rhs;
+    *this = tmp_lhs;
   }
 
   // Compund operators: upcast overloads for /=
@@ -504,7 +490,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val += rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -529,7 +515,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val -= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -554,7 +540,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val *= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -579,7 +565,7 @@ class half_t {
 #ifdef __CUDA_ARCH__
     lhs.val /= rhs.val;
 #else
-    lhs.val      = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
+    lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
 #endif
     return lhs;
   }
@@ -683,6 +669,62 @@ class half_t {
     return __half2float(val) >= __half2float(rhs.val);
 #endif
   }
+
+  KOKKOS_FUNCTION
+  friend bool operator==(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs == tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator!=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs != tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<(const volatile half_t& lhs,
+                        const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs < tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>(const volatile half_t& lhs,
+                        const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs > tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator<=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs <= tmp_rhs;
+  }
+
+  KOKKOS_FUNCTION
+  friend bool operator>=(const volatile half_t& lhs,
+                         const volatile half_t& rhs) {
+    half_t tmp_lhs = lhs, tmp_rhs = rhs;
+    return tmp_lhs >= tmp_rhs;
+  }
+
+  // Insertion and extraction operators
+  friend std::ostream& operator<<(std::ostream& os, const half_t& x) {
+    const std::string out = std::to_string(static_cast<double>(x));
+    os << out;
+    return os;
+  }
+
+  friend std::istream& operator>>(std::istream& is, half_t& x) {
+    std::string in;
+    is >> in;
+    x = std::stod(in);
+    return is;
+  }
 };
 
 // CUDA before 11.1 only has the half <-> float conversions marked host device
@@ -943,6 +985,25 @@ KOKKOS_INLINE_FUNCTION
 }
 #endif
 }  // namespace Experimental
+
+// use float as the return type for sum and prod since cuda_fp16.h
+// has no constexpr functions for casting to __half
+template <>
+struct reduction_identity<Kokkos::Experimental::half_t> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() noexcept {
+    return 0.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() noexcept {
+    return 1.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() noexcept {
+    return -65504.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() noexcept {
+    return 65504.0F;
+  }
+};
+
 }  // namespace Kokkos
 #endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
 #endif  // KOKKOS_ENABLE_CUDA
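The reduction_identity<Kokkos::Experimental::half_t> specialization added above is what lets half_t serve as the value type of a built-in Kokkos sum reduction. A minimal usage sketch, assuming a build where half_t is backed by __half; the program is illustrative and not part of the Kokkos tests:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using Kokkos::Experimental::half_t;
    const int n = 1024;

    Kokkos::View<half_t*> x("x", n);
    Kokkos::deep_copy(x, Kokkos::Experimental::cast_to_half(1.0f));

    half_t sum;
    Kokkos::parallel_reduce(
        "half_sum", n,
        KOKKOS_LAMBDA(const int i, half_t& acc) { acc += x(i); }, sum);

    // 1024 ones sum exactly in fp16 (integers up to 2048 are representable).
    std::printf("sum = %f\n",
                static_cast<double>(
                    Kokkos::Experimental::cast_from_half<float>(sum)));
  }
  Kokkos::finalize();
  return 0;
}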
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 016cb6cdcbdd37740613724bb99efb9b4c32d7d4..6964d5b41b72368e9b4305d37e156e08321f7814 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -119,7 +119,7 @@ int cuda_kernel_arch() {
   int arch    = 0;
   int *d_arch = nullptr;
 
-  cudaMalloc((void **)&d_arch, sizeof(int));
+  cudaMalloc(reinterpret_cast<void **>(&d_arch), sizeof(int));
   cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault);
 
   query_cuda_kernel_arch<<<1, 1>>>(d_arch);
@@ -141,7 +141,36 @@ bool cuda_launch_blocking() {
 
 }  // namespace
 
-void cuda_device_synchronize() { CUDA_SAFE_CALL(cudaDeviceSynchronize()); }
+void cuda_device_synchronize(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      });
+}
+
+void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
+                             const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          ptr->impl_get_instance_id()},
+      [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
+
+void cuda_stream_synchronize(
+    const cudaStream_t stream,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
+      name, reason, [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
 
 void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
                                const int line) {
@@ -221,7 +250,7 @@ CudaInternalDevices::CudaInternalDevices() {
   // See 'cudaSetDeviceFlags' for host-device thread interaction
   // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
 
-  CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
 
   if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
@@ -229,7 +258,7 @@ CudaInternalDevices::CudaInternalDevices() {
         "have. Please report this to github.com/kokkos/kokkos.");
   }
   for (int i = 0; i < m_cudaDevCount; ++i) {
-    CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
   }
 }
 
@@ -277,25 +306,27 @@ CudaInternal::~CudaInternal() {
               << std::endl;
   }
 
-  m_cudaDev                   = -1;
-  m_cudaArch                  = -1;
-  m_multiProcCount            = 0;
-  m_maxWarpCount              = 0;
-  m_maxBlock                  = 0;
-  m_maxSharedWords            = 0;
-  m_maxConcurrency            = 0;
-  m_scratchSpaceCount         = 0;
-  m_scratchFlagsCount         = 0;
-  m_scratchUnifiedCount       = 0;
-  m_scratchUnifiedSupported   = 0;
-  m_streamCount               = 0;
-  m_scratchSpace              = nullptr;
-  m_scratchFlags              = nullptr;
-  m_scratchUnified            = nullptr;
-  m_scratchConcurrentBitset   = nullptr;
-  m_stream                    = nullptr;
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  m_cudaDev                 = -1;
+  m_cudaArch                = -1;
+  m_multiProcCount          = 0;
+  m_maxWarpCount            = 0;
+  m_maxBlock                = 0;
+  m_maxSharedWords          = 0;
+  m_maxConcurrency          = 0;
+  m_scratchSpaceCount       = 0;
+  m_scratchFlagsCount       = 0;
+  m_scratchUnifiedCount     = 0;
+  m_scratchUnifiedSupported = 0;
+  m_streamCount             = 0;
+  m_scratchSpace            = nullptr;
+  m_scratchFlags            = nullptr;
+  m_scratchUnified          = nullptr;
+  m_scratchConcurrentBitset = nullptr;
+  m_stream                  = nullptr;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
+  }
 }
 
 int CudaInternal::verify_is_initialized(const char *const label) const {
@@ -305,16 +336,20 @@ int CudaInternal::verify_is_initialized(const char *const label) const {
   }
   return 0 <= m_cudaDev;
 }
-
+uint32_t CudaInternal::impl_get_instance_id() const { return m_instance_id; }
 CudaInternal &CudaInternal::singleton() {
   static CudaInternal self;
   return self;
 }
+void CudaInternal::fence(const std::string &name) const {
+  Impl::cuda_stream_synchronize(m_stream, this, name);
+}
 void CudaInternal::fence() const {
-  CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream));
+  fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence");
 }
 
-void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
+void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream,
+                              bool manage_stream) {
   if (was_finalized)
     Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
   was_initialized = true;
@@ -350,8 +385,9 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
     m_cudaDev    = cuda_device_id;
     m_deviceProp = cudaProp;
 
-    CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
-    Kokkos::Impl::cuda_device_synchronize();
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
+    Kokkos::Impl::cuda_device_synchronize(
+        "Kokkos::CudaInternal::initialize: Fence on space initialization");
 
     // Query what compute capability architecture a kernel executes:
     m_cudaArch = cuda_kernel_arch();
@@ -464,8 +500,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
 
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
 
-      CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
-                                sizeof(uint32_t) * buffer_bound));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
+                                            sizeof(uint32_t) * buffer_bound));
     }
     //----------------------------------
 
@@ -535,15 +571,19 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
   // Allocate a staging buffer for constant mem in pinned host memory
   // and an event to avoid overwriting driver for previous kernel launches
   if (stream == nullptr) {
-    CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging,
-                                  CudaTraits::ConstantMemoryUsage));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMallocHost(reinterpret_cast<void **>(&constantMemHostStaging),
+                       CudaTraits::ConstantMemoryUsage));
 
-    CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
   }
 
-  m_stream                    = stream;
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  m_stream        = stream;
+  m_manage_stream = manage_stream;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
+  }
 }
 
 //----------------------------------------------------------------------------
@@ -569,7 +609,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
   }
 
@@ -645,20 +685,37 @@ Cuda::size_type *CudaInternal::scratch_functor(
   return m_scratchFunctor;
 }
 
-void *CudaInternal::resize_team_scratch_space(std::int64_t bytes,
-                                              bool force_shrink) {
-  if (m_team_scratch_current_size == 0) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
-        "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size);
+std::pair<void *, int> CudaInternal::resize_team_scratch_space(
+    std::int64_t bytes, bool force_shrink) {
+  // Multiple ParallelFor/Reduce Teams can call this function at the same time
+  // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race
+  // condition.
+
+  int current_team_scratch = 0;
+  int zero                 = 0;
+  int one                  = 1;
+  while (m_team_scratch_pool[current_team_scratch].compare_exchange_weak(
+      zero, one, std::memory_order_release, std::memory_order_relaxed)) {
+    current_team_scratch = (current_team_scratch + 1) % m_n_team_scratch;
   }
-  if ((bytes > m_team_scratch_current_size) ||
-      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
-        m_team_scratch_ptr, m_team_scratch_current_size);
+  if (m_team_scratch_current_size[current_team_scratch] == 0) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
+            "Kokkos::CudaSpace::TeamScratchMemory",
+            m_team_scratch_current_size[current_team_scratch]);
   }
-  return m_team_scratch_ptr;
+  if ((bytes > m_team_scratch_current_size[current_team_scratch]) ||
+      ((bytes < m_team_scratch_current_size[current_team_scratch]) &&
+       (force_shrink))) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
+            m_team_scratch_ptr[current_team_scratch],
+            m_team_scratch_current_size[current_team_scratch]);
+  }
+  return std::make_pair(m_team_scratch_ptr[current_team_scratch],
+                        current_team_scratch);
 }
 
 //----------------------------------------------------------------------------
@@ -685,36 +742,43 @@ void CudaInternal::finalize() {
     if (m_scratchFunctorSize > 0)
       RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));
 
-    if (m_team_scratch_current_size > 0)
-      Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr);
-
-    m_cudaDev                   = -1;
-    m_multiProcCount            = 0;
-    m_maxWarpCount              = 0;
-    m_maxBlock                  = 0;
-    m_maxSharedWords            = 0;
-    m_scratchSpaceCount         = 0;
-    m_scratchFlagsCount         = 0;
-    m_scratchUnifiedCount       = 0;
-    m_streamCount               = 0;
-    m_scratchSpace              = nullptr;
-    m_scratchFlags              = nullptr;
-    m_scratchUnified            = nullptr;
-    m_scratchConcurrentBitset   = nullptr;
-    m_stream                    = nullptr;
-    m_team_scratch_current_size = 0;
-    m_team_scratch_ptr          = nullptr;
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      if (m_team_scratch_current_size[i] > 0)
+        Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]);
+    }
+
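+    // Destroy the stream only if this instance was created with
+    // manage_stream = true (e.g. instances created via partition_space);
+    // externally provided streams are left alone.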
+    if (m_manage_stream && m_stream != nullptr)
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream));
+
+    m_cudaDev                 = -1;
+    m_multiProcCount          = 0;
+    m_maxWarpCount            = 0;
+    m_maxBlock                = 0;
+    m_maxSharedWords          = 0;
+    m_scratchSpaceCount       = 0;
+    m_scratchFlagsCount       = 0;
+    m_scratchUnifiedCount     = 0;
+    m_streamCount             = 0;
+    m_scratchSpace            = nullptr;
+    m_scratchFlags            = nullptr;
+    m_scratchUnified          = nullptr;
+    m_scratchConcurrentBitset = nullptr;
+    m_stream                  = nullptr;
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      m_team_scratch_current_size[i] = 0;
+      m_team_scratch_ptr[i]          = nullptr;
+    }
   }
 
   // only destroy these if we're finalizing the singleton
   if (this == &singleton()) {
-    cudaFreeHost(constantMemHostStaging);
-    cudaEventDestroy(constantMemReusable);
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable));
     auto &deep_copy_space =
         Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
     if (deep_copy_space)
       deep_copy_space->impl_internal_space_instance()->finalize();
-    cudaStreamDestroy(cuda_get_deep_copy_stream());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream()));
   }
 }
 
@@ -823,7 +887,7 @@ Cuda::Cuda()
       "Cuda instance constructor");
 }
 
-Cuda::Cuda(cudaStream_t stream)
+Cuda::Cuda(cudaStream_t stream, bool manage_stream)
     : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) {
         ptr->finalize();
         delete ptr;
@@ -831,18 +895,31 @@ Cuda::Cuda(cudaStream_t stream)
   Impl::CudaInternal::singleton().verify_is_initialized(
       "Cuda instance constructor");
   m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,
-                               stream);
+                               stream, manage_stream);
 }
 
 void Cuda::print_configuration(std::ostream &s, const bool) {
   Impl::CudaInternal::singleton().print_configuration(s);
 }
 
-void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); }
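+// The string overloads forward the fence label to the profiling interface so
+// that tools can attribute each synchronization to its call site.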
+void Cuda::impl_static_fence(const std::string &name) {
+  Kokkos::Impl::cuda_device_synchronize(name);
+}
+void Cuda::impl_static_fence() {
+  impl_static_fence("Kokkos::Cuda::impl_static_fence(): Unnamed Static Fence");
+}
 
-void Cuda::fence() const { m_space_instance->fence(); }
+void Cuda::fence() const {
+  fence("Kokkos::Cuda::fence(): Unnamed Instance Fence");
+}
+void Cuda::fence(const std::string &name) const {
+  m_space_instance->fence(name);
+}
 
 const char *Cuda::name() { return "Cuda"; }
+uint32_t Cuda::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
 
 cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; }
 int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; }
@@ -877,7 +954,15 @@ void CudaSpaceInitializer::finalize(bool all_spaces) {
   }
 }
 
-void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); }
+void CudaSpaceInitializer::fence() {
+  Kokkos::Cuda::impl_static_fence(
+      "Kokkos::CudaSpaceInitializer::fence: Initializer Fence");
+}
+void CudaSpaceInitializer::fence(const std::string &name) {
+  // Kokkos::Cuda::impl_static_fence("Kokkos::CudaSpaceInitializer::fence:
+  // "+name); //TODO: or this
+  Kokkos::Cuda::impl_static_fence(name);
+}
 
 void CudaSpaceInitializer::print_configuration(std::ostream &msg,
                                                const bool detail) {
@@ -916,12 +1001,6 @@ void CudaSpaceInitializer::print_configuration(std::ostream &msg,
   msg << "yes\n";
 #else
   msg << "no\n";
-#endif
-  msg << "  KOKKOS_ENABLE_CUSPARSE: ";
-#ifdef KOKKOS_ENABLE_CUSPARSE
-  msg << "yes\n";
-#else
-  msg << "no\n";
 #endif
   msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
index aaec2c29260a5ad2b82e2daa653a58372253cd4d..7eb169838c05dc144e9789d4466f83d3febfe926 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -3,6 +3,9 @@
 
 #include <vector>
 #include <impl/Kokkos_Tools.hpp>
+#include <atomic>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // These functions fulfill the purpose of allowing to work around
@@ -114,10 +117,14 @@ class CudaInternal {
   mutable size_type* m_scratchFunctor;
   uint32_t* m_scratchConcurrentBitset;
   cudaStream_t m_stream;
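+  // Instance identifier reported to the profiling/tooling interface.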
+  uint32_t m_instance_id;
+  bool m_manage_stream;
 
   // Team Scratch Level 1 Space
-  mutable int64_t m_team_scratch_current_size;
-  mutable void* m_team_scratch_ptr;
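+  // Fixed-size pool of team scratch allocations: resize_team_scratch_space()
+  // claims a slot, and the ParallelFor/ParallelReduce destructor releases it
+  // by resetting the corresponding pool entry to 0.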
+  int m_n_team_scratch = 10;
+  mutable int64_t m_team_scratch_current_size[10];
+  mutable void* m_team_scratch_ptr[10];
+  mutable std::atomic_int m_team_scratch_pool[10];
 
   bool was_initialized = false;
   bool was_finalized   = false;
@@ -135,7 +142,8 @@ class CudaInternal {
     return nullptr != m_scratchSpace && nullptr != m_scratchFlags;
   }
 
-  void initialize(int cuda_device_id, cudaStream_t stream = nullptr);
+  void initialize(int cuda_device_id, cudaStream_t stream = nullptr,
+                  bool manage_stream = false);
   void finalize();
 
   void print_configuration(std::ostream&) const;
@@ -145,6 +153,7 @@ class CudaInternal {
   static void cuda_set_serial_execution(bool);
 #endif
 
+  void fence(const std::string&) const;
   void fence() const;
 
   ~CudaInternal();
@@ -175,20 +184,68 @@ class CudaInternal {
         m_scratchFunctor(nullptr),
         m_scratchConcurrentBitset(nullptr),
         m_stream(nullptr),
-        m_team_scratch_current_size(0),
-        m_team_scratch_ptr(nullptr) {}
+        m_instance_id(
+            Kokkos::Tools::Experimental::Impl::idForInstance<Kokkos::Cuda>(
+                reinterpret_cast<uintptr_t>(this))) {
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      m_team_scratch_current_size[i] = 0;
+      m_team_scratch_ptr[i]          = nullptr;
+      m_team_scratch_pool[i]         = 0;
+    }
+  }
 
   // Resizing of reduction related scratch spaces
   size_type* scratch_space(const size_type size) const;
   size_type* scratch_flags(const size_type size) const;
   size_type* scratch_unified(const size_type size) const;
   size_type* scratch_functor(const size_type size) const;
-
+  uint32_t impl_get_instance_id() const;
   // Resizing of team level 1 scratch
-  void* resize_team_scratch_space(std::int64_t bytes,
-                                  bool force_shrink = false);
+  std::pair<void*, int> resize_team_scratch_space(std::int64_t bytes,
+                                                  bool force_shrink = false);
 };
 
 }  // Namespace Impl
+
+namespace Experimental {
+// Partitioning an Execution Space: expects space and integer arguments for
+// relative weight
+//   Customization point for backends
+//   Default behavior is to return the passed in instance
+
+namespace Impl {
+inline void create_Cuda_instances(std::vector<Cuda>& instances) {
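+  // Each instance receives its own newly created CUDA stream, which the
+  // instance owns and destroys again when it is finalized.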
+  for (int s = 0; s < int(instances.size()); s++) {
+    cudaStream_t stream;
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
+    instances[s] = Cuda(stream, true);
+  }
+}
+}  // namespace Impl
+
+template <class... Args>
+std::vector<Cuda> partition_space(const Cuda&, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
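+  // Only the number of arguments matters here; the weight values themselves
+  // are not (yet) used to size the partitions.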
+  std::vector<Cuda> instances(sizeof...(Args));
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
+
+template <class T>
+std::vector<Cuda> partition_space(const Cuda&, std::vector<T>& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<Cuda> instances(weights.size());
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
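+// Hypothetical usage sketch: partition_space(Kokkos::Cuda(), 1, 1) returns two
+// Cuda instances, each backed by its own stream; the std::vector overload
+// behaves the same way, with one instance per weight entry.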
+}  // namespace Experimental
+
 }  // Namespace Kokkos
 #endif
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index d892a893b330772ec5e4306ed20a44f8aa2369f1..4b01798f5e2cad495c897b8110d96eec87fe429f 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -167,7 +167,7 @@ inline void configure_shmem_preference(KernelFuncPtr const& func,
 #ifndef KOKKOS_ARCH_KEPLER
   // On Kepler the L1 has no benefit since it doesn't cache reads
   auto set_cache_config = [&] {
-    CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
         func,
         (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
     return prefer_shmem;
@@ -372,14 +372,15 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
           &graph_node, graph, /* dependencies = */ nullptr,
           /* numDependencies = */ 0, &params));
     } else {
       // We still need an empty node for the dependency structure
-      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                           /* dependencies = */ nullptr,
-                                           /* numDependencies = */ 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaGraphAddEmptyNode(&graph_node, graph,
+                                /* dependencies = */ nullptr,
+                                /* numDependencies = */ 0));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -475,14 +476,15 @@ struct CudaParallelLaunchKernelInvoker<
       params.kernelParams   = (void**)args;
       params.extra          = nullptr;
 
-      CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
           &graph_node, graph, /* dependencies = */ nullptr,
           /* numDependencies = */ 0, &params));
     } else {
       // We still need an empty node for the dependency structure
-      CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                           /* dependencies = */ nullptr,
-                                           /* numDependencies = */ 0));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaGraphAddEmptyNode(&graph_node, graph,
+                                /* dependencies = */ nullptr,
+                                /* numDependencies = */ 0));
     }
     KOKKOS_ENSURES(bool(graph_node))
   }
@@ -538,7 +540,8 @@ struct CudaParallelLaunchKernelInvoker<
                             dim3 const& block, int shmem,
                             CudaInternal const* cuda_instance) {
     // Wait until the previous kernel that uses the constant buffer is done
-    CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventSynchronize(cuda_instance->constantMemReusable));
 
     // Copy functor (synchronously) to staging buffer in pinned host memory
     unsigned long* staging = cuda_instance->constantMemHostStaging;
@@ -554,8 +557,9 @@ struct CudaParallelLaunchKernelInvoker<
          get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();
 
     // Record an event that says when the constant buffer can be reused
-    CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
-                                   cudaStream_t(cuda_instance->m_stream)));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventRecord(cuda_instance->constantMemReusable,
+                        cudaStream_t(cuda_instance->m_stream)));
   }
 
 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
@@ -637,8 +641,9 @@ struct CudaParallelLaunchImpl<
       base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      CUDA_SAFE_CALL(cudaGetLastError());
-      cuda_instance->fence();
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+      cuda_instance->fence(
+          "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error");
 #endif
     }
   }
@@ -650,7 +655,7 @@ struct CudaParallelLaunchImpl<
     // the code and the result is visible.
     auto wrap_get_attributes = []() -> cudaFuncAttributes {
       cudaFuncAttributes attr_tmp;
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
       return attr_tmp;
     };
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
index ff31649544033b773519152ca25a22494fdd2f5f..1f3024f3186a14d847a6999b995832e7782b62e9 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -81,22 +81,34 @@ namespace Impl {
 CudaLockArrays g_host_cuda_lock_arrays = {nullptr, nullptr, 0};
 
 void initialize_host_cuda_lock_arrays() {
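+  // With desul atomics enabled, initialize desul's device-side lock arrays
+  // as well, before setting up the Kokkos lock arrays below.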
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::init_lock_arrays();
+
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+#endif
   if (g_host_cuda_lock_arrays.atomic != nullptr) return;
-  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
-                            sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
-  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
-                            sizeof(int) * (Cuda::concurrency())));
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(
+      cudaMalloc(&g_host_cuda_lock_arrays.atomic,
+                 sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
+                                        sizeof(int) * (Cuda::concurrency())));
+  Impl::cuda_device_synchronize(
+      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
   g_host_cuda_lock_arrays.n = Cuda::concurrency();
   KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
   init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
                                   256>>>();
   init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency() + 255) / 256,
                                     256>>>(Kokkos::Cuda::concurrency());
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  Impl::cuda_device_synchronize(
+      "Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays");
 }
 
 void finalize_host_cuda_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::finalize_lock_arrays();
+#endif
+
   if (g_host_cuda_lock_arrays.atomic == nullptr) return;
   cudaFree(g_host_cuda_lock_arrays.atomic);
   g_host_cuda_lock_arrays.atomic = nullptr;
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
index 7640b8084d16a210408deb94a35f8962dfc92c99..04fb7cb345a27e9d9932d188216d5261d8606939 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -53,6 +53,10 @@
 
 #include <Cuda/Kokkos_Cuda_Error.hpp>
 
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics/Lock_Array_Cuda.hpp>
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -150,13 +154,14 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
 }  // namespace Impl
 }  // namespace Kokkos
+
 /* Dan Ibanez: it is critical that this code be a macro, so that it will
    capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
    putting this in an inline function will NOT do the right thing! */
 #define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                      \
   {                                                                   \
     if (::Kokkos::Impl::lock_array_copied == 0) {                     \
-      CUDA_SAFE_CALL(                                                 \
+      KOKKOS_IMPL_CUDA_SAFE_CALL(                                     \
           cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
                              &Kokkos::Impl::g_host_cuda_lock_arrays,  \
                              sizeof(Kokkos::Impl::CudaLockArrays)));  \
@@ -164,6 +169,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
     lock_array_copied = 1;                                            \
   }
 
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+
 #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
 #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
 #else
@@ -171,6 +178,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
   KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
+#else
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+// Still need KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() for team scratch etc.
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#endif
+
+#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
+
 #endif /* defined( KOKKOS_ENABLE_CUDA ) */
 
 #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 2834e6f3de012b718ae06ebb6f87d7d24e3e5756..f83b43e608855492f9d4df725533a08184f5edaf 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -62,7 +62,6 @@
 #include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include <Cuda/Kokkos_Cuda_Team.hpp>
 #include <Kokkos_Vectorization.hpp>
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
 #include <typeinfo>
@@ -240,9 +239,11 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
 
   //----------------------------------------
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED inline int vector_length() const {
     return impl_vector_length();
   }
+#endif
   inline int impl_vector_length() const { return m_vector_length; }
   inline int team_size() const { return m_team_size; }
   inline int league_size() const { return m_league_size; }
@@ -687,6 +688,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_shmem_size;
   void* m_scratch_ptr[2];
   int m_scratch_size[2];
+  int m_scratch_pool_id = -1;
 
   template <class TagType>
   __device__ inline
@@ -797,15 +799,19 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
     m_scratch_ptr[0] = nullptr;
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
-                      static_cast<ptrdiff_t>(Cuda::concurrency() /
-                                             (m_team_size * m_vector_size)));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     const int shmem_size_total = m_shmem_begin + m_shmem_size;
     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
@@ -829,6 +835,14 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
           "Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
     }
   }
+
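+  // Release the team scratch pool slot (if any) that was claimed when the
+  // scratch space was resized in the constructor.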
+  ~ParallelFor() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
 };
 
 }  // namespace Impl
@@ -870,9 +884,24 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   using value_type     = typename ValueTraits::value_type;
   using reference_type = typename ValueTraits::reference_type;
   using functor_type   = FunctorType;
-  using size_type      = Kokkos::Cuda::size_type;
-  using index_type     = typename Policy::index_type;
-  using reducer_type   = ReducerType;
+  // Conditionally set word_size_type to int16_t or int8_t if value_type is
+  // smaller than int32_t (Kokkos::Cuda::size_type)
+  // word_size_type is used to determine the word count, shared memory buffer
+  // size, and global memory buffer size before the reduction is performed.
+  // Within the reduction, the word count is recomputed based on word_size_type
+  // and when calculating indexes into the shared/global memory buffers for
+  // performing the reduction, word_size_type is used again.
+  // For scalars > 4 bytes in size, indexing into shared/global memory relies
+  // on the block and grid dimensions to ensure that we index at the correct
+  // offset rather than at every 4-byte word, so that when the join is
+  // performed we have the correct data that was copied over in chunks of 4
+  // bytes.
+  using word_size_type = typename std::conditional<
+      sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
+      typename std::conditional<sizeof(value_type) == 2, int16_t, int8_t>::type,
+      Kokkos::Cuda::size_type>::type;
+  using index_type   = typename Policy::index_type;
+  using reducer_type = ReducerType;
 
   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
@@ -883,9 +912,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_scratch_space;
+  // m_scratch_flags must be of type Cuda::size_type due to use of atomics
+  // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
+  Cuda::size_type* m_scratch_flags;
+  word_size_type* m_unified_space;
 
   // Shall we use the shfl based reduction or not (only use it for static sized
   // types of more than 128bit)
@@ -924,16 +955,16 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       __device__ inline
       void run(const DummySHMEMReductionType& ) const
       {*/
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
+    const integral_nonzero_constant<
+        word_size_type, ValueTraits::StaticValueSize / sizeof(word_size_type)>
         word_count(ValueTraits::value_size(
                        ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
+                   sizeof(word_size_type));
 
     {
       reference_type value =
           ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          kokkos_impl_cuda_shared_memory<size_type>() +
+                          kokkos_impl_cuda_shared_memory<word_size_type>() +
                               threadIdx.y * word_count.value);
 
       // Number of blocks is bounded so that the reduction can be limited to two
@@ -958,11 +989,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       // This is the final block with the final result at the final threads'
       // location
 
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
               : (m_unified_space ? m_unified_space : m_scratch_space);
 
       if (threadIdx.y == 0) {
@@ -985,17 +1017,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd,
                                                 WorkTagFwd>(
                 ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-                gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
+                gridDim.x, kokkos_impl_cuda_shared_memory<word_size_type>(),
                 m_scratch_space, m_scratch_flags)) {
           // This is the final block with the final result at the final threads'
           // location
 
-          size_type* const shared =
-              kokkos_impl_cuda_shared_memory<size_type>() +
+          word_size_type* const shared =
+              kokkos_impl_cuda_shared_memory<word_size_type>() +
               (blockDim.y - 1) * word_count.value;
-          size_type* const global =
+          word_size_type* const global =
               m_result_ptr_device_accessible
-                  ? reinterpret_cast<size_type*>(m_result_ptr)
+                  ? reinterpret_cast<word_size_type*>(m_result_ptr)
                   : (m_unified_space ? m_unified_space : m_scratch_space);
 
           if (threadIdx.y == 0) {
@@ -1100,15 +1132,21 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
       KOKKOS_ASSERT(block_size > 0);
 
-      m_scratch_space = cuda_internal_scratch_space(
+      // TODO: down casting these uses more space than required?
+      m_scratch_space = (word_size_type*)cuda_internal_scratch_space(
           m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
                                 m_functor, m_reducer)) *
                                 block_size /* block_size == max block_count */);
-      m_scratch_flags =
-          cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
-      m_unified_space = cuda_internal_scratch_unified(
-          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
-                                m_functor, m_reducer)));
+
+      // Intentionally do not downcast to word_size_type since we use Cuda
+      // atomics in Kokkos_Cuda_ReduceScan.hpp
+      m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(),
+                                                    sizeof(Cuda::size_type));
+      m_unified_space =
+          reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
+              m_policy.space(),
+              ValueTraits::value_size(
+                  ReducerConditional::select(m_functor, m_reducer))));
 
       // REQUIRED ( 1 , N , 1 )
       dim3 block(1, block_size, 1);
@@ -1139,7 +1177,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, RangePolicy>::execute: Result "
+            "Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1459,7 +1499,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, MDRangePolicy>::execute: "
+            "Result Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1580,6 +1622,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   size_type m_shmem_size;
   void* m_scratch_ptr[2];
   int m_scratch_size[2];
+  int m_scratch_pool_id = -1;
   const size_type m_league_size;
   int m_team_size;
   const size_type m_vector_size;
@@ -1821,7 +1864,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
           true);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<Cuda, TeamPolicy>::execute: Result "
+            "Not Device Accessible");
 
         if (m_result_ptr) {
           if (m_unified_space) {
@@ -1895,16 +1940,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<std::int64_t>(m_scratch_size[1]) *
-                      (static_cast<std::int64_t>(
-                          Cuda::concurrency() /
-                          (m_team_size * m_vector_size))));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     // The global parallel_reduce does not support vector_length other than 1 at
     // the moment
@@ -1973,6 +2021,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     cudaFuncAttributes attr =
         CudaParallelLaunch<ParallelReduce,
                            LaunchBounds>::get_cuda_func_attributes();
+
+    // A valid team size was not provided; deduce the team size
     m_team_size =
         m_team_size >= 0
             ? m_team_size
@@ -1994,15 +2044,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
     m_scratch_size[0] = m_shmem_size;
     m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
-    m_scratch_ptr[1] =
-        m_team_size <= 0
-            ? nullptr
-            : m_policy.space()
-                  .impl_internal_space_instance()
-                  ->resize_team_scratch_space(
-                      static_cast<ptrdiff_t>(m_scratch_size[1]) *
-                      static_cast<ptrdiff_t>(Cuda::concurrency() /
-                                             (m_team_size * m_vector_size)));
+    if (m_team_size <= 0) {
+      m_scratch_ptr[1] = nullptr;
+    } else {
+      auto scratch_ptr_id =
+          m_policy.space()
+              .impl_internal_space_instance()
+              ->resize_team_scratch_space(
+                  static_cast<std::int64_t>(m_scratch_size[1]) *
+                  (static_cast<std::int64_t>(Cuda::concurrency() /
+                                             (m_team_size * m_vector_size))));
+      m_scratch_ptr[1]  = scratch_ptr_id.first;
+      m_scratch_pool_id = scratch_ptr_id.second;
+    }
 
     // The global parallel_reduce does not support vector_length other than 1 at
     // the moment
@@ -2030,13 +2084,28 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
     }
-    if (int(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+
+    size_type team_size_max =
+        Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+            m_policy.space().impl_internal_space_instance(), attr, m_functor,
+            m_vector_size, m_policy.team_scratch_size(0),
+            m_policy.thread_scratch_size(0)) /
+        m_vector_size;
+
+    if ((int)m_team_size > (int)team_size_max) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
                       "large team size."));
     }
   }
+
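+  // Release the claimed team scratch pool slot, mirroring ParallelFor above.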
+  ~ParallelReduce() {
+    if (m_scratch_pool_id >= 0) {
+      m_policy.space()
+          .impl_internal_space_instance()
+          ->m_team_scratch_pool[m_scratch_pool_id] = 0;
+    }
+  }
 };
 
 }  // namespace Impl
@@ -2167,9 +2236,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 
     for (typename Policy::member_type iwork_base = range.begin();
          iwork_base < range.end(); iwork_base += blockDim.y) {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-#endif
+      unsigned MASK                            = __activemask();
       const typename Policy::member_type iwork = iwork_base + threadIdx.y;
 
       __syncthreads();  // Don't overwrite previous iteration values until they
@@ -2182,11 +2249,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
       for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
         shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(MASK);
       if (CudaTraits::WarpSize < word_count.value) {
         __syncthreads();
       }  // Protect against large scan values.
@@ -2457,9 +2520,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     for (typename Policy::member_type iwork_base = range.begin();
          iwork_base < range.end(); iwork_base += blockDim.y) {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-#endif
+      unsigned MASK = __activemask();
 
       const typename Policy::member_type iwork = iwork_base + threadIdx.y;
 
@@ -2474,11 +2535,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
         shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
       }
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(MASK);
       if (CudaTraits::WarpSize < word_count.value) {
         __syncthreads();
       }  // Protect against large scan values.
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index fc9fc3770bead16eff4a0b5b6fea8b0a2039200f..e5b05bcc64f183ef98248a239e6b305fae9410ea 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -191,48 +191,28 @@ __device__ bool cuda_inter_block_reduction(
         value_type tmp = Kokkos::shfl_down(value, 1, 32);
         if (id + 1 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-      int active        = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      unsigned int mask = __activemask();
+      int active        = __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2, 32);
         if (id + 2 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4, 32);
         if (id + 4 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8, 32);
         if (id + 8 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16, 32);
         if (id + 16 < int(gridDim.x)) join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
     }
   }
   // The last block has in its thread=0 the global reduction value through
@@ -388,48 +368,28 @@ __device__ inline
         value_type tmp = Kokkos::shfl_down(value, 1, 32);
         if (id + 1 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-      int active        = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      unsigned int mask = __activemask();
+      int active        = __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2, 32);
         if (id + 2 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4, 32);
         if (id + 4 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8, 32);
         if (id + 8 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
       if (int(blockDim.x * blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16, 32);
         if (id + 16 < int(gridDim.x)) reducer.join(value, tmp);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-      active += KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+      active += __ballot_sync(mask, 1);
     }
   }
 
@@ -573,23 +533,17 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
                                // part of the reduction
       const int width)         // How much of the warp participates
   {
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
     unsigned mask =
         width == 32
             ? 0xffffffff
             : ((1 << width) - 1)
                   << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
-#endif
     const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
     for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
       if (lane_id + delta < 32) {
         ValueJoin::join(functor, value, value + delta);
       }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-      KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
-#else
-      KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+      __syncwarp(mask);
     }
     *value = *(value - lane_id);
   }
@@ -612,17 +566,18 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
       const unsigned int delta = (threadIdx.y * blockDim.x + threadIdx.x) * 32;
       if (delta < blockDim.x * blockDim.y)
         *my_shared_team_buffer_element = shared_team_buffer_element[delta];
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
       scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, false,
                                   blockDim.x * blockDim.y / 32);
       if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
     }
   }
 
+  template <class SizeType = Cuda::size_type>
   __device__ static inline bool scalar_inter_block_reduction(
       const FunctorType& functor, const Cuda::size_type /*block_id*/,
-      const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-      Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+      const Cuda::size_type block_count, SizeType* const shared_data,
+      SizeType* const global_data, Cuda::size_type* const global_flags) {
     Scalar* const global_team_buffer_element = ((Scalar*)global_data);
     Scalar* const my_global_team_buffer_element =
         global_team_buffer_element + blockIdx.x;
@@ -713,17 +668,17 @@ __device__ void cuda_intra_block_reduce_scan(
   const pointer_type tdata_intra = base_data + value_count * threadIdx.y;
 
   {  // Intra-warp reduction:
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4)
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
   }
 
   __syncthreads();  // Wait for all warps to reduce
@@ -732,57 +687,31 @@ __device__ void cuda_intra_block_reduce_scan(
     const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask)
                                 << CudaTraits::WarpIndexShift;
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    unsigned inner_mask =
-        KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff, (rtid_inter < blockDim.y));
-#endif
+    unsigned inner_mask = __ballot_sync(0xffffffff, (rtid_inter < blockDim.y));
     if (rtid_inter < blockDim.y) {
       const pointer_type tdata_inter =
           base_data + value_count * (rtid_inter ^ BlockSizeMask);
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
       if ((1 << 5) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
       }
       if ((1 << 6) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
       }
       if ((1 << 7) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
       }
       if ((1 << 8) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
       }
       if ((1 << 9) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
+        __syncwarp(inner_mask);
         BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
       }
-#else
-      if ((1 << 5) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
-      }
-      if ((1 << 6) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
-      }
-      if ((1 << 7) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
-      }
-      if ((1 << 8) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
-      }
-      if ((1 << 9) < BlockSizeMask) {
-        KOKKOS_IMPL_CUDA_SYNCWARP;
-        BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
-      }
-#endif
 
       if (DoScan) {
         int n =
@@ -795,25 +724,14 @@ __device__ void cuda_intra_block_reduce_scan(
 
         if (!(rtid_inter + n < blockDim.y)) n = 0;
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 8)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 7)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 6)
-        KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
-        BLOCK_SCAN_STEP(tdata_inter, n, 5)
-#else
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 8)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 7)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 6)
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(inner_mask);
         BLOCK_SCAN_STEP(tdata_inter, n, 5)
-#endif
       }
     }
   }
@@ -832,17 +750,17 @@ __device__ void cuda_intra_block_reduce_scan(
                                               : ((rtid_intra & 16) ? 16 : 0))));
 
     if (!(rtid_intra + n < blockDim.y)) n = 0;
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
     BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block();
-    KOKKOS_IMPL_CUDA_SYNCWARP;
+    __syncwarp(0xffffffff);
   }
 
 #undef BLOCK_SCAN_STEP
@@ -858,12 +776,13 @@ __device__ void cuda_intra_block_reduce_scan(
  *  Global reduce result is in the last threads' 'shared_data' location.
  */
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType, class ArgTag,
+          class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan2(
     const FunctorType& functor, const Cuda::size_type block_id,
-    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
-  using size_type   = Cuda::size_type;
+    const Cuda::size_type block_count, SizeType* const shared_data,
+    SizeType* const global_data, Cuda::size_type* const global_flags) {
+  using size_type   = SizeType;
   using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
   using ValueJoin   = FunctorValueJoin<FunctorType, ArgTag>;
   using ValueInit   = FunctorValueInit<FunctorType, ArgTag>;
@@ -953,11 +872,12 @@ __device__ bool cuda_single_inter_block_reduce_scan2(
   return is_last_block;
 }
 
-template <bool DoScan, class FunctorType, class ArgTag>
+template <bool DoScan, class FunctorType, class ArgTag,
+          class SizeType = Cuda::size_type>
 __device__ bool cuda_single_inter_block_reduce_scan(
     const FunctorType& functor, const Cuda::size_type block_id,
-    const Cuda::size_type block_count, Cuda::size_type* const shared_data,
-    Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
+    const Cuda::size_type block_count, SizeType* const shared_data,
+    SizeType* const global_data, Cuda::size_type* const global_flags) {
   using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
   if (!DoScan && ValueTraits::StaticValueSize > 0)
     return Kokkos::Impl::CudaReductionsFunctor<
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 2004edbeacdb4b5b309ea3bd6eb83b3abcfacea6..88ac0d1878a911a876210fe06cc52fa1d8285be6 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -54,11 +54,27 @@
 #include <Kokkos_Core_fwd.hpp>
 
 #include <impl/Kokkos_TaskBase.hpp>
-#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
+#include <Cuda/Kokkos_Cuda_Error.hpp>  // KOKKOS_IMPL_CUDA_SAFE_CALL
 #include <impl/Kokkos_TaskTeamMember.hpp>
 
 //----------------------------------------------------------------------------
 
+#if defined(__CUDA_ARCH__)
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)                           \
+  {                                                                        \
+    __syncwarp();                                                          \
+    const unsigned b = __activemask();                                     \
+    if (b != 0xffffffff) {                                                 \
+      printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG,     \
+             blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
+             threadIdx.z, b);                                              \
+      return;                                                              \
+    }                                                                      \
+  }
+#else
+#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
+#endif
+
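A minimal hypothetical CUDA kernel (illustrative only, not part of this patch) showing the convergence check the macro above performs: after __syncwarp(), every lane of a converged warp is active, so any hole in __activemask() signals divergence.

// Sketch only; assumes a 1-D thread block for simplicity.
__global__ void check_warp_convergence(int* diverged_warps) {
  __syncwarp();                          // re-converge the lanes still present
  const unsigned mask = __activemask();  // one bit per currently active lane
  if (mask != 0xffffffffu) {
    // lowest active lane reports once per diverged warp
    if ((threadIdx.x & 31u) == unsigned(__ffs(mask) - 1)) {
      atomicAdd(diverged_warps, 1);
    }
  }
}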
 namespace Kokkos {
 namespace Impl {
 namespace {
@@ -138,13 +154,13 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
       // Broadcast task pointer:
 
       // Sync before the broadcast
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
 
       // pretend it's an int* for shuffle purposes
       ((int*)&current_task)[0] =
-          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[0], 0, 32);
+          __shfl_sync(0xffffffff, ((int*)&current_task)[0], 0, 32);
       ((int*)&current_task)[1] =
-          KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[1], 0, 32);
+          __shfl_sync(0xffffffff, ((int*)&current_task)[1], 0, 32);
 
       if (current_task) {
         KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());
@@ -168,7 +184,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
         // Synchronize threads of the warp and insure memory
         // writes are visible to all threads in the warp.
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (shared_memory_task_copy->is_team_runnable()) {
           // Thread Team Task
@@ -182,7 +198,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
         // Synchronize threads of the warp and insure memory
         // writes are visible to all threads in the warp.
 
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
         // b -= b % CudaTraits::WarpSize;
@@ -196,7 +212,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
         // writes are visible to root thread of the warp for
         // respawn or completion.
 
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (warp_lane == 0) {
           // If respawn requested copy respawn data back to main memory
@@ -249,12 +265,14 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
 
     auto& queue = scheduler.queue();
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Pre Task Execution");
 
     // Query the stack size, in bytes:
 
     size_t previous_stack_size = 0;
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
 
     // If not large enough then set the stack size, in bytes:
@@ -262,18 +280,21 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     const size_t larger_stack_size = 1 << 11;
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Post Task Execution");
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
     }
   }
@@ -295,13 +316,17 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
     destroy_type* dtor_ptr =
         (destroy_type*)((char*)storage + sizeof(function_type));
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Pre Get Function Pointer for Tasks");
 
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
+        "Cuda>::execute: Post Get Function Pointer for Tasks");
 
     ptr  = *ptr_ptr;
     dtor = *dtor_ptr;
@@ -372,23 +397,20 @@ class TaskQueueSpecializationConstrained<
           // count of 0 also. Otherwise, returns a task from another queue
           // or `end` if one couldn't be popped
           task_ptr = team_queue.attempt_to_steal_task();
-#if 0
-          if(task != no_more_tasks_sentinel && task != end) {
-            std::printf("task stolen on rank %d\n", team_exec.league_rank());
-          }
-#endif
         }
       }
 
       // Synchronize warp with memory fence before broadcasting task pointer:
 
       // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
 
       // Broadcast task pointer:
 
-      ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32);
-      ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32);
+      ((int*)&task_ptr)[0] =
+          __shfl_sync(0xffffffff, ((int*)&task_ptr)[0], 0, 32);
+      ((int*)&task_ptr)[1] =
+          __shfl_sync(0xffffffff, ((int*)&task_ptr)[1], 0, 32);
 
 #if defined(KOKKOS_ENABLE_DEBUG)
       KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr");
@@ -418,7 +440,7 @@ class TaskQueueSpecializationConstrained<
         // writes are visible to all threads in the warp.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         if (task_root_type::TaskTeam == task_shmem->m_task_type) {
           // Thread Team Task
@@ -432,7 +454,7 @@ class TaskQueueSpecializationConstrained<
         // writes are visible to all threads in the warp.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // copy task closure from shared to global memory:
 
@@ -445,7 +467,7 @@ class TaskQueueSpecializationConstrained<
         // respawn or completion.
 
         // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
-        KOKKOS_IMPL_CUDA_SYNCWARP;
+        __syncwarp(0xffffffff);
 
         // If respawn requested copy respawn data back to main memory
 
@@ -475,12 +497,14 @@ class TaskQueueSpecializationConstrained<
     auto& queue = scheduler.queue();
     queue.initialize_team_queues(warps_per_block * grid.x);
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::execute: Pre Execute Task");
 
     // Query the stack size, in bytes:
 
     size_t previous_stack_size = 0;
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));
 
     // If not large enough then set the stack size, in bytes:
@@ -488,18 +512,21 @@ class TaskQueueSpecializationConstrained<
     const size_t larger_stack_size = 2048;
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
+          cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
     }
 
     cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
         scheduler, shared_per_warp);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::execute: Post Execute Task");
 
     if (previous_stack_size < larger_stack_size) {
-      CUDA_SAFE_CALL(
+      KOKKOS_IMPL_CUDA_SAFE_CALL(
           cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
     }
   }
@@ -516,13 +543,17 @@ class TaskQueueSpecializationConstrained<
     destroy_type* dtor_ptr =
         (destroy_type*)((char*)storage + sizeof(function_type));
 
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::get_function_pointer: Pre Get Function Pointer");
 
     set_cuda_task_base_apply_function_pointer<TaskType>
         <<<1, 1>>>(ptr_ptr, dtor_ptr);
 
-    CUDA_SAFE_CALL(cudaGetLastError());
-    CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
+    Impl::cuda_device_synchronize(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
+        "Kokkos::Cuda>::get_function_pointer: Post Get Function Pointer");
 
     ptr  = *ptr_ptr;
     dtor = *dtor_ptr;
@@ -609,7 +640,7 @@ class TaskExec<Kokkos::Cuda, Scheduler> {
 
   __device__ void team_barrier() const {
     if (1 < m_team_size) {
-      KOKKOS_IMPL_CUDA_SYNCWARP;
+      __syncwarp(0xffffffff);
     }
   }
 
@@ -1205,5 +1236,7 @@ KOKKOS_INLINE_FUNCTION void single(
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN
+
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index e7806390155d46fd811a21432d9f9d268c457468..922b980a2545b4e35d573d44806d76fdf1ca1ea2 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -340,191 +340,6 @@ class CudaTeamMember {
 #endif
   }
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
-                    void* const global_scratch_space, void* const shmem,
-                    int const shmem_size) {
-#ifdef __CUDA_ARCH__
-
-    using value_type   = typename ReducerType::value_type;
-    using pointer_type = value_type volatile*;
-
-    // Number of shared memory entries for the reduction:
-    const int nsh = shmem_size / sizeof(value_type);
-
-    // Number of CUDA threads in the block, rank within the block
-    const int nid = blockDim.x * blockDim.y * blockDim.z;
-    const int tid =
-        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
-
-    // Reduces within block using all available shared memory
-    // Contributes if it is the root "vector lane"
-
-    // wn == number of warps in the block
-    // wx == which lane within the warp
-    // wy == which warp within the block
-
-    const int wn =
-        (nid + CudaTraits::WarpIndexMask) >> CudaTraits::WarpIndexShift;
-    const int wx = tid & CudaTraits::WarpIndexMask;
-    const int wy = tid >> CudaTraits::WarpIndexShift;
-
-    //------------------------
-    {  // Intra warp shuffle reduction from contributing CUDA threads
-
-      value_type tmp(reducer.reference());
-
-      for (int i = CudaTraits::WarpSize; (int)blockDim.x <= (i >>= 1);) {
-        Impl::in_place_shfl_down(reducer.reference(), tmp, i,
-                                 CudaTraits::WarpSize);
-
-        // Root of each vector lane reduces "thread" contribution
-        if (0 == threadIdx.x && wx < i) {
-          reducer.join(&tmp, reducer.data());
-        }
-      }
-
-      // Reduce across warps using shared memory.
-      // Number of warps may not be power of two.
-
-      __syncthreads();  // Wait before shared data write
-
-      // Number of shared memory entries for the reduction
-      // is at most one per warp
-      const int nentry = wn < nsh ? wn : nsh;
-
-      if (0 == wx && wy < nentry) {
-        // Root thread of warp 'wy' has warp's value to contribute
-        ((value_type*)shmem)[wy] = tmp;
-      }
-
-      __syncthreads();  // Wait for write to be visible to block
-
-      // When more warps than shared entries
-      // then warps must take turns joining their contribution
-      // to the designated shared memory entry.
-      for (int i = nentry; i < wn; i += nentry) {
-        const int k = wy - i;
-
-        if (0 == wx && i <= wy && k < nentry) {
-          // Root thread of warp 'wy' has warp's value to contribute
-          reducer.join(((value_type*)shmem) + k, &tmp);
-        }
-
-        __syncthreads();  // Wait for write to be visible to block
-      }
-
-      // One warp performs the inter-warp reduction:
-
-      if (0 == wy) {
-        // Start fan-in at power of two covering nentry
-
-        for (int i = (1 << (32 - __clz(nentry - 1))); (i >>= 1);) {
-          const int k = wx + i;
-          if (wx < i && k < nentry) {
-            reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
-            __threadfence_block();  // Wait for write to be visible to warp
-          }
-        }
-      }
-    }
-    //------------------------
-    {  // Write block's value to global_scratch_memory
-
-      int last_block = 0;
-
-      if (0 == wx) {
-        reducer.copy(((pointer_type)global_scratch_space) +
-                         blockIdx.x * reducer.length(),
-                     reducer.data());
-
-        __threadfence();  // Wait until global write is visible.
-
-        last_block = (int)gridDim.x ==
-                     1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);
-
-        // If last block then reset count
-        if (last_block) *global_scratch_flags = 0;
-      }
-
-      last_block = __syncthreads_or(last_block);
-
-      if (!last_block) return 0;
-    }
-    //------------------------
-    // Last block reads global_scratch_memory into shared memory.
-
-    const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
-                                       : (gridDim.x < nsh ? gridDim.x : nsh);
-
-    // nentry = min( nid , nsh , gridDim.x )
-
-    // whole block reads global memory into shared memory:
-
-    if (tid < nentry) {
-      const int offset = tid * reducer.length();
-
-      reducer.copy(((pointer_type)shmem) + offset,
-                   ((pointer_type)global_scratch_space) + offset);
-
-      for (int i = nentry + tid; i < (int)gridDim.x; i += nentry) {
-        reducer.join(
-            ((pointer_type)shmem) + offset,
-            ((pointer_type)global_scratch_space) + i * reducer.length());
-      }
-    }
-
-    __syncthreads();  // Wait for writes to be visible to block
-
-    if (0 == wy) {
-      // Iterate to reduce shared memory to single warp fan-in size
-
-      const int nreduce =
-          CudaTraits::WarpSize < nentry ? CudaTraits::WarpSize : nentry;
-
-      // nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )
-
-      if (wx < nreduce && nreduce < nentry) {
-        for (int i = nreduce + wx; i < nentry; i += nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
-        }
-        __threadfence_block();  // Wait for writes to be visible to warp
-      }
-
-      // Start fan-in at power of two covering nentry
-
-      for (int i = (1 << (32 - __clz(nreduce - 1))); (i >>= 1);) {
-        const int k = wx + i;
-        if (wx < i && k < nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
-          __threadfence_block();  // Wait for writes to be visible to warp
-        }
-      }
-
-      if (0 == wx) {
-        reducer.copy(reducer.data(), (pointer_type)shmem);
-        return 1;
-      }
-    }
-    return 0;
-
-#else
-    (void)reducer;
-    (void)global_scratch_flags;
-    (void)global_scratch_space;
-    (void)shmem;
-    (void)shmem_size;
-    return 0;
-#endif
-  }
-
   //----------------------------------------
   // Private for the driver
 
@@ -533,7 +348,7 @@ class CudaTeamMember {
                  void* scratch_level_1_ptr, const int scratch_level_1_size,
                  const int arg_league_rank, const int arg_league_size)
       : m_team_reduce(shared),
-        m_team_shared(((char*)shared) + shared_begin, shared_size,
+        m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
                       scratch_level_1_ptr, scratch_level_1_size),
         m_team_reduce_size(shared_begin),
         m_league_rank(arg_league_rank),
@@ -854,14 +669,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
        i += blockDim.x) {
     closure(i);
   }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
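
To make the mask arithmetic above concrete, here is a small host-side sketch (illustrative only) that recomputes the sub-warp mask for a vector length of 8: each warp then holds four 8-wide vector groups, and the group selected by threadIdx.y gets a contiguous 8-bit mask.

#include <cassert>
#include <cstdio>

// Same expression as the __syncwarp argument above, evaluated on the host.
unsigned sub_warp_mask(unsigned block_dim_x, unsigned thread_idx_y) {
  return block_dim_x == 32
             ? 0xffffffffu
             : ((1u << block_dim_x) - 1u)
                   << (thread_idx_y % (32u / block_dim_x)) * block_dim_x;
}

int main() {
  // Four 8-lane groups per warp; group 2 covers lanes 16..23 -> 0x00ff0000.
  assert(sub_warp_mask(8, 2) == 0x00ff0000u);
  assert(sub_warp_mask(8, 6) == 0x00ff0000u);  // 6 % 4 == 2: same group
  std::printf("%#x\n", sub_warp_mask(8, 2));
}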
 
@@ -1100,14 +911,10 @@ KOKKOS_INLINE_FUNCTION void single(
   (void)lambda;
 #ifdef __CUDA_ARCH__
   if (threadIdx.x == 0) lambda();
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
@@ -1118,14 +925,10 @@ KOKKOS_INLINE_FUNCTION void single(
   (void)lambda;
 #ifdef __CUDA_ARCH__
   if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
-      blockDim.x == 32 ? 0xffffffff
-                       : ((1 << blockDim.x) - 1)
-                             << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
-#else
-  KOKKOS_IMPL_CUDA_SYNCWARP;
-#endif
+  __syncwarp(blockDim.x == 32
+                 ? 0xffffffff
+                 : ((1 << blockDim.x) - 1)
+                       << (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
 #endif
 }
 
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
index 7f7b7b6e78adc3de9d5ae446565eedc7d00439f5..31d3c47e1c9c9af3b6c6d8c918abe01dd0b238fe 100644
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
+++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -48,7 +48,12 @@
 #ifdef KOKKOS_ENABLE_CUDA
 
 #include <type_traits>
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
+
+#if !defined(KOKKOS_COMPILER_CLANG)
+#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
+#else
+#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
+#endif
 
 namespace Kokkos {
 
@@ -61,7 +66,7 @@ constexpr unsigned shfl_all_mask = 0xffffffffu;
 // Shuffle operations require input to be a register (stack) variable
 
 // Derived implements do_shfl_op(unsigned mask, T& in, int lane, int width),
-// which turns in to one of KOKKOS_IMPL_CUDA_SHFL(_UP_|_DOWN_|_)MASK
+// which turns into one of __shfl_sync, __shfl_up_sync, or __shfl_down_sync
 // Since the logic with respect to value sizes, etc., is the same everywhere,
 // put it all in one place.
 template <class Derived>
@@ -157,7 +162,7 @@ struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> {
     (void)val;
     (void)lane;
     (void)width;
-    return KOKKOS_IMPL_CUDA_SHFL_MASK(mask, val, lane, width);
+    return __shfl_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -170,7 +175,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> {
   __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val,
                                                   int lane, int width) const
       noexcept {
-    return KOKKOS_IMPL_CUDA_SHFL_UP_MASK(mask, val, lane, width);
+    return __shfl_up_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -188,7 +193,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> {
     (void)val;
     (void)lane;
     (void)width;
-    return KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(mask, val, lane, width);
+    return __shfl_down_sync(mask, val, lane, width);
   }
 };
 template <class... Args>
@@ -228,5 +233,7 @@ __device__ inline T shfl_up(const T& val, int delta, int width,
 
 }  // end namespace Kokkos
 
+#undef KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF
+
 #endif  // defined( KOKKOS_ENABLE_CUDA )
 #endif  // !defined( KOKKOS_CUDA_VECTORIZATION_HPP )
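
Since the wrappers above now call the *_sync shuffle intrinsics directly, here is a standalone CUDA sketch (not from this patch) of the basic pattern they implement: a full-warp sum via __shfl_down_sync with the all-lanes mask that shfl_all_mask denotes.

// Sum across a warp; after the loop, lane 0 holds the total.
__device__ int warp_sum(int v) {
  constexpr unsigned all_lanes = 0xffffffffu;  // every lane participates
  for (int offset = 16; offset > 0; offset >>= 1) {
    v += __shfl_down_sync(all_lanes, v, offset, 32);  // pull from lane + offset
  }
  return v;
}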
diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
deleted file mode 100644
index 0cdd84ce27157e118065c6fbcf2da71a875b81e0..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <Kokkos_Macros.hpp>
-
-#if defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
-#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
-#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(), x)
-#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) __ballot_sync(m, x)
-#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) __shfl_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) __shfl_sync(m, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) __shfl_up_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m, x, y, z) __shfl_up_sync(m, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) \
-  __shfl_down_sync(0xffffffff, x, y, z)
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) __shfl_down_sync(m, x, y, z)
-#else
-#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
-#define KOKKOS_IMPL_CUDA_SYNCWARP
-#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m
-#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
-#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) 0
-#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) 0
-#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) 0
-#endif
-
-#if !defined(KOKKOS_COMPILER_CLANG)
-#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
-#else
-#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
-#endif
-
-#if defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)                           \
-  {                                                                        \
-    __syncwarp();                                                          \
-    const unsigned b = __activemask();                                     \
-    if (b != 0xffffffff) {                                                 \
-      printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG,     \
-             blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
-             threadIdx.z, b);                                              \
-      return;                                                              \
-    }                                                                      \
-  }
-#else
-#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
-#endif
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
index 9278d1bdc9efcc2a76183085c974afef41413e3c..7eb3e1e9f70fe4cf724e3b766e38ebc16b3c7c8f 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp
@@ -45,6 +45,7 @@
 #ifndef KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
 #define KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
 
+#include <functional>
 #include <Kokkos_Macros.hpp>
 
 #if defined(__HIPCC__)
@@ -56,118 +57,239 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-template <typename DriverType, bool, int MaxThreadsPerBlock, int MinBlocksPerSM>
-void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
-  // FIXME_HIP - currently the "constant" path is unimplemented.
-  //             we should look at whether it's functional, and
-  //             perform some simple scaling studies to see when /
-  //             if the constant launcher outperforms the current
-  //             pass by pointer shared launcher
-  HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-      numBlocks,
-      hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                       MinBlocksPerSM>,
-      blockSize, sharedmem));
-}
+enum class BlockType { Max, Preferred };
 
-template <typename DriverType, bool constant>
-void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
-  hipOccupancy<DriverType, constant, HIPTraits::MaxThreadsPerBlock, 1>(
-      numBlocks, blockSize, sharedmem);
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+unsigned get_preferred_blocksize_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // use the user specified value
+    return LaunchBounds::maxTperB;
+  } else {
+    if (HIPParallelLaunch<DriverType, LaunchBounds,
+                          LaunchMechanism>::get_scratch_size() > 0) {
+      return HIPTraits::ConservativeThreadsPerBlock;
+    }
+    return HIPTraits::MaxThreadsPerBlock;
+  }
 }
 
-template <class FunctorType, class LaunchBounds, typename F>
-int hip_internal_get_block_size(const F &condition_check,
-                                const HIPInternal *hip_instance,
-                                const hipFuncAttributes &attr,
-                                const FunctorType &f,
-                                const size_t vector_length,
-                                const size_t shmem_block,
-                                const size_t shmem_thread) {
-  const int min_blocks_per_sm =
-      LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
-  const int max_threads_per_block = LaunchBounds::maxTperB == 0
-                                        ? HIPTraits::MaxThreadsPerBlock
-                                        : LaunchBounds::maxTperB;
-
-  const int regs_per_wavefront  = std::max(attr.numRegs, 1);
-  const int regs_per_sm         = hip_instance->m_regsPerSM;
-  const int shmem_per_sm        = hip_instance->m_shmemPerSM;
-  const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock;
-  const int max_blocks_per_sm   = hip_instance->m_maxBlocksPerSM;
-  const int max_threads_per_sm  = hip_instance->m_maxThreadsPerSM;
-
-  int block_size = max_threads_per_block;
-  KOKKOS_ASSERT(block_size > 0);
-  const int blocks_per_warp =
-      (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize;
-
-  int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
-      f, block_size / vector_length);
-  int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                    functor_shmem + attr.sharedSizeBytes;
-  int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
-  int max_blocks_shmem =
-      (total_shmem < max_shmem_per_block)
-          ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-          : 0;
-  int blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-  int threads_per_sm = blocks_per_sm * block_size;
-  if (threads_per_sm > max_threads_per_sm) {
-    blocks_per_sm  = max_threads_per_sm / block_size;
-    threads_per_sm = blocks_per_sm * block_size;
+// FIXME_HIP - entire function could be constexpr for c++17
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+unsigned get_max_blocksize_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // use the user specified value
+    return LaunchBounds::maxTperB;
+  } else {
+    // we can always fit 1024-thread blocks if we only care about registers
+    // ... and don't mind spilling
+    return HIPTraits::MaxThreadsPerBlock;
   }
-  int opt_block_size =
-      (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm;
-  int opt_threads_per_sm = threads_per_sm;
-  block_size -= HIPTraits::WarpSize;
-  while (condition_check(blocks_per_sm) &&
-         (block_size >= HIPTraits::WarpSize)) {
-    functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
-        f, block_size / vector_length);
-    total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                  functor_shmem + attr.sharedSizeBytes;
-    max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
-    max_blocks_shmem =
-        (total_shmem < max_shmem_per_block)
-            ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-            : 0;
-    blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-    threads_per_sm = blocks_per_sm * block_size;
-    if (threads_per_sm > max_threads_per_sm) {
-      blocks_per_sm  = max_threads_per_sm / block_size;
-      threads_per_sm = blocks_per_sm * block_size;
-    }
-    if ((blocks_per_sm >= min_blocks_per_sm) &&
-        (blocks_per_sm <= max_blocks_per_sm)) {
-      if (threads_per_sm >= opt_threads_per_sm) {
-        opt_block_size     = block_size;
-        opt_threads_per_sm = threads_per_sm;
+}
+
+// convenience method to select and return the proper function attributes
+// for a kernel, given the launch bounds et al.
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          BlockType BlockSize = BlockType::Max,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+hipFuncAttributes get_hip_func_attributes_impl() {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // for user defined, we *always* honor the request
+    return HIPParallelLaunch<DriverType, LaunchBounds,
+                             LaunchMechanism>::get_hip_func_attributes();
+  } else {
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      return HIPParallelLaunch<
+          DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+          LaunchMechanism>::get_hip_func_attributes();
+    } else {
+      const int blocksize =
+          get_preferred_blocksize_impl<DriverType, LaunchBounds,
+                                       LaunchMechanism>();
+      if (blocksize == HIPTraits::MaxThreadsPerBlock) {
+        return HIPParallelLaunch<
+            DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+            LaunchMechanism>::get_hip_func_attributes();
+      } else {
+        return HIPParallelLaunch<
+            DriverType,
+            Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
+            LaunchMechanism>::get_hip_func_attributes();
       }
     }
-    block_size -= HIPTraits::WarpSize;
   }
-  return opt_block_size;
 }
 
-template <class FunctorType, class LaunchBounds>
-int hip_get_max_block_size(const HIPInternal *hip_instance,
-                           const hipFuncAttributes &attr, const FunctorType &f,
-                           const size_t vector_length, const size_t shmem_block,
-                           const size_t shmem_thread) {
-  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
-      [](int x) { return x == 0; }, hip_instance, attr, f, vector_length,
-      shmem_block, shmem_thread);
+// Given an initial block-size limitation based on register usage, determine
+// the block size to select based on the LDS limitation
+template <BlockType BlockSize, class DriverType, class LaunchBounds,
+          typename ShmemFunctor>
+unsigned hip_internal_get_block_size(const HIPInternal *hip_instance,
+                                     const ShmemFunctor &f,
+                                     const unsigned tperb_reg) {
+  // translate LB from CUDA to HIP
+  const unsigned min_waves_per_eu =
+      LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1;
+  const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize;
+  const unsigned shmem_per_sm       = hip_instance->m_shmemPerSM;
+  unsigned block_size               = tperb_reg;
+  do {
+    unsigned total_shmem = f(block_size);
+    // find how many threads we can fit with this blocksize based on LDS usage
+    unsigned tperb_shmem = total_shmem > shmem_per_sm ? 0 : block_size;
+
+    // FIXME_HIP - could be if constexpr for c++17
+    if (BlockSize == BlockType::Max) {
+      // we want the maximum blocksize possible
+      // just wait until we get a case where we can fit the LDS per SM
+      if (tperb_shmem) return block_size;
+    } else {
+      if (block_size == tperb_reg && tperb_shmem >= tperb_reg) {
+        // fast path for exit on first iteration if registers are more limiting
+        // than LDS usage, just use the register limited size
+        return tperb_reg;
+      }
+      // otherwise we need to apply a heuristic to choose the blocksize
+      // the current launchbound selection scheme is:
+      //      1. If no spills, choose 1024 [MaxThreadsPerBlock]
+      //      2. Otherwise, choose 256 [ConservativeThreadsPerBlock]
+      //
+      // For blocksizes between 256 and 1024, we'll be forced to use the 1024 LB
+      // and we'll already have pretty decent occupancy, thus dropping to 256
+      // *probably* isn't a concern
+      const unsigned blocks_per_cu_shmem = shmem_per_sm / total_shmem;
+      const unsigned tperb = tperb_shmem < tperb_reg ? tperb_shmem : tperb_reg;
+
+      // for anything with > 4 wavefronts that can also fit multiple blocks
+      // per CU we're probably not occupancy limited, so just return this size
+      if (blocks_per_cu_shmem > 1 &&
+          tperb > HIPTraits::ConservativeThreadsPerBlock) {
+        return block_size;
+      }
+
+      // otherwise, it's probably better to drop to the first valid size that
+      // fits in the ConservativeThreadsPerBlock
+      if (tperb >= min_threads_per_sm) return block_size;
+    }
+    block_size >>= 1;
+  } while (block_size >= HIPTraits::WarpSize);
+  // TODO: return a negative, add an error to kernel launch
+  return 0;
+}
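
A host-side sketch (illustrative numbers, not Kokkos API) of how the halving loop above converges under the BlockType::Max policy: starting from the register-limited block size, the LDS requirement reported by the functor decides whether a candidate size is kept or halved.

#include <cstdio>

int main() {
  const unsigned shmem_per_cu = 64 * 1024;  // hypothetical LDS per CU
  auto lds_usage = [](unsigned block_size) { return 80u * block_size; };
  for (unsigned block_size = 1024; block_size >= 64; block_size >>= 1) {
    const unsigned need = lds_usage(block_size);
    std::printf("block %4u needs %6u bytes of LDS -> %s\n", block_size, need,
                need <= shmem_per_cu ? "fits, use it" : "halve");
    if (need <= shmem_per_cu) break;  // 1024 needs 81920 (too much); 512 fits
  }
}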
+
+// Standardized blocksize deduction for parallel constructs with no LDS usage
+// Returns the preferred blocksize as dictated by register usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds>
+unsigned hip_get_preferred_blocksize() {
+  return get_preferred_blocksize_impl<DriverType, LaunchBounds>();
+}
+
+// Standardized blocksize deduction for parallel constructs with no LDS usage
+// Returns the max blocksize as dictated by register usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds>
+unsigned hip_get_max_blocksize() {
+  return get_max_blocksize_impl<DriverType, LaunchBounds>();
+}
+
+// Standardized blocksize deduction for non-teams parallel constructs with LDS
+// usage. Returns the 'preferred' blocksize, as determined by the heuristics in
+// hip_internal_get_block_size.
+//
+// The ShmemFunctor takes a single argument of the current blocksize under
+// consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
+unsigned hip_get_preferred_blocksize(HIPInternal const *hip_instance,
+                                     ShmemFunctor const &f) {
+  // get preferred blocksize limited by register usage
+  const unsigned tperb_reg =
+      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
+                                     LaunchBounds>(hip_instance, f, tperb_reg);
+}
+
+// Standardized blocksize deduction for teams-based parallel constructs with
+// LDS usage. Returns the 'preferred' blocksize, as determined by the
+// heuristics in hip_internal_get_block_size.
+//
+// The ShmemTeamsFunctor takes two arguments: the hipFuncAttributes and the
+// current blocksize under consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds,
+          typename ShmemTeamsFunctor>
+unsigned hip_get_preferred_team_blocksize(HIPInternal const *hip_instance,
+                                          ShmemTeamsFunctor const &f) {
+  hipFuncAttributes attr =
+      get_hip_func_attributes_impl<DriverType, LaunchBounds,
+                                   BlockType::Preferred>();
+  // get preferred blocksize limited by register usage
+  using namespace std::placeholders;
+  const unsigned tperb_reg =
+      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
+                                     LaunchBounds>(
+      hip_instance, std::bind(f, attr, _1), tperb_reg);
+}
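
The std::bind call above fixes the function attributes as the first argument and leaves the block size as the single remaining parameter (_1), which is exactly the unary shape hip_internal_get_block_size expects. A standalone illustration with hypothetical types:

#include <cstdio>
#include <functional>

struct FakeAttrs { unsigned shared_bytes; };

// Two-argument "team" shmem functor, analogous to ShmemTeamsFunctor.
unsigned team_lds(FakeAttrs attrs, unsigned block_size) {
  return attrs.shared_bytes + 8u * block_size;
}

int main() {
  using namespace std::placeholders;
  FakeAttrs attrs{1024};
  auto unary = std::bind(team_lds, attrs, _1);  // block_size -> LDS bytes
  std::printf("%u\n", unary(256u));             // prints 3072
}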
+
+// Standardized blocksize deduction for non-teams parallel constructs with LDS
+// usage. Returns the maximum possible blocksize, as determined by the
+// heuristics in hip_internal_get_block_size.
+//
+// The ShmemFunctor takes a single argument of the current blocksize under
+// consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
+unsigned hip_get_max_blocksize(HIPInternal const *hip_instance,
+                               ShmemFunctor const &f) {
+  // get max blocksize limited by register usage
+  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
+      hip_instance, f, tperb_reg);
 }
 
-template <typename FunctorType, typename LaunchBounds>
-int hip_get_opt_block_size(HIPInternal const *hip_instance,
-                           hipFuncAttributes const &attr, FunctorType const &f,
-                           size_t const vector_length, size_t const shmem_block,
-                           size_t const shmem_thread) {
-  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
-      [](int) { return true; }, hip_instance, attr, f, vector_length,
-      shmem_block, shmem_thread);
+// Standardized blocksize deduction for teams-based parallel constructs with
+// LDS usage. Returns the maximum possible blocksize, as determined by the
+// heuristics in hip_internal_get_block_size.
+//
+// The ShmemTeamsFunctor takes two arguments: the hipFuncAttributes and the
+// current blocksize under consideration, and returns the LDS usage
+//
+// Note: a returned block_size of zero indicates that the algorithm could not
+//       find a valid block size.  The caller is responsible for error handling.
+template <typename DriverType, typename LaunchBounds,
+          typename ShmemTeamsFunctor>
+unsigned hip_get_max_team_blocksize(HIPInternal const *hip_instance,
+                                    ShmemTeamsFunctor const &f) {
+  hipFuncAttributes attr =
+      get_hip_func_attributes_impl<DriverType, LaunchBounds, BlockType::Max>();
+  // get max blocksize
+  using namespace std::placeholders;
+  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
+  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
+      hip_instance, std::bind(f, attr, _1), tperb_reg);
 }
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
index b3480bcad00c7ec6bc1a011a49fe7f9ae5eba345..a75e7a4a6c9351c0d39f7b2f7e8719a1a81c0adf 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp
@@ -66,12 +66,30 @@ inline void hip_internal_safe_call(hipError_t e, const char* name,
   }
 }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
+KOKKOS_DEPRECATED
+inline void hip_internal_safe_call_deprecated(hipError_t e, const char* name,
+                                              const char* file = nullptr,
+                                              const int line   = 0) {
+  hip_internal_safe_call(e, name, file, line);
+}
+
+#endif
+
 }  // namespace Impl
 }  // namespace Kokkos
 
-#define HIP_SAFE_CALL(call) \
+#define KOKKOS_IMPL_HIP_SAFE_CALL(call) \
   Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__)
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+#define HIP_SAFE_CALL(call)                                              \
+  Kokkos::Impl::hip_internal_safe_call_deprecated(call, #call, __FILE__, \
+                                                  __LINE__)
+
+#endif
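
Call sites look the same as before apart from the macro name; a small hypothetical example (the allocation is illustrative only, HIP runtime headers assumed):

inline void hip_safe_call_example() {
  void* scratch = nullptr;
  // reports the stringified call plus __FILE__/__LINE__ on failure
  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&scratch, 1024));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(scratch));
}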
+
 namespace Kokkos {
 namespace Experimental {
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index 18ef10e22cd39b30118f78882a3ce747c19b9901..336ac8c6987c6538836f49792c41fd5520d0af8a 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -77,7 +77,7 @@ class HIPInternalDevices {
 };
 
 HIPInternalDevices::HIPInternalDevices() {
-  HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
 
   if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
@@ -85,7 +85,7 @@ HIPInternalDevices::HIPInternalDevices() {
         "have. Please report this to github.com/kokkos/kokkos.");
   }
   for (int i = 0; i < m_hipDevCount; ++i) {
-    HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
   }
 }
 
@@ -95,6 +95,9 @@ const HIPInternalDevices &HIPInternalDevices::singleton() {
 }
 }  // namespace
 
+unsigned long *Impl::HIPInternal::constantMemHostStaging = nullptr;
+hipEvent_t Impl::HIPInternal::constantMemReusable        = nullptr;
+
 namespace Impl {
 
 //----------------------------------------------------------------------------
@@ -154,6 +157,9 @@ int HIPInternal::verify_is_initialized(const char *const label) const {
   return 0 <= m_hipDev;
 }
 
+uint32_t HIPInternal::impl_get_instance_id() const noexcept {
+  return m_instance_id;
+}
 HIPInternal &HIPInternal::singleton() {
   static HIPInternal *self = nullptr;
   if (!self) {
@@ -163,12 +169,23 @@ HIPInternal &HIPInternal::singleton() {
 }
 
 void HIPInternal::fence() const {
-  HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
-  // can reset our cycle id now as well
-  m_cycleId = 0;
+  fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence");
+}
+void HIPInternal::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          impl_get_instance_id()},
+      [&]() {
+        KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
+        // can reset our cycle id now as well
+        m_cycleId = 0;
+      });
 }
 
-void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
+void HIPInternal::initialize(int hip_device_id, hipStream_t stream,
+                             bool manage_stream) {
   if (was_finalized)
     Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n");
 
@@ -197,9 +214,10 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     m_hipDev     = hip_device_id;
     m_deviceProp = hipProp;
 
-    HIP_SAFE_CALL(hipSetDevice(m_hipDev));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(m_hipDev));
 
     m_stream                    = stream;
+    m_manage_stream             = manage_stream;
     m_team_scratch_current_size = 0;
     m_team_scratch_ptr          = nullptr;
 
@@ -222,7 +240,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     // theoretically, we can get 40 WF's / CU, but only can sustain 32
     // see
     // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742
-    m_maxBlocksPerSM = 32;
+    m_maxWavesPerCU = 32;
     // FIXME_HIP - Nick to implement this upstream
     //             Register count comes from Sec. 2.2. "Data Sharing" of the
     //             Vega 7nm ISA document (see the diagram)
@@ -232,7 +250,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
     m_regsPerSM        = 65536;
     m_shmemPerSM       = hipProp.maxSharedMemoryPerMultiProcessor;
     m_maxShmemPerBlock = hipProp.sharedMemPerBlock;
-    m_maxThreadsPerSM  = m_maxBlocksPerSM * HIPTraits::WarpSize;
+    m_maxThreadsPerSM  = m_maxWavesPerCU * HIPTraits::WarpSize;
     //----------------------------------
     // Multiblock reduction uses scratch flags for counters
     // and scratch space for partial reduction values.
@@ -265,8 +283,8 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
 
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());
 
-      HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
-                              sizeof(uint32_t) * buffer_bound));
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
+                                          sizeof(uint32_t) * buffer_bound));
     }
     //----------------------------------
 
@@ -287,6 +305,15 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
 
   // Init the array for used for arbitrarily sized atomics
   if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays();
+
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting driver for previous kernel launches
+  if (m_stream == nullptr) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostMalloc((void **)&constantMemHostStaging,
+                                            HIPTraits::ConstantMemoryUsage));
+
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventCreate(&constantMemReusable));
+  }
 }
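
The staging buffer and event allocated above are presumably consumed by the constant-memory launch path along the lines of the hedged sketch below; the buffer, event, and symbol names come from this diff, while the surrounding function, the exact call sequence, and the <cstring>/HIP runtime includes are assumptions.

template <class DriverType>
void upload_driver_to_constant_memory_sketch(DriverType const &driver,
                                             hipStream_t stream) {
  using Kokkos::Experimental::Impl::HIPInternal;
  // Wait until the kernel launched after the previous upload is done with it.
  KOKKOS_IMPL_HIP_SAFE_CALL(
      hipEventSynchronize(HIPInternal::constantMemReusable));
  std::memcpy(HIPInternal::constantMemHostStaging, &driver, sizeof(DriverType));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(
      HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer),
      HIPInternal::constantMemHostStaging, sizeof(DriverType), 0,
      hipMemcpyHostToDevice, stream));
  // ... launch a kernel that reads the driver back out of constant memory ...
  KOKKOS_IMPL_HIP_SAFE_CALL(
      hipEventRecord(HIPInternal::constantMemReusable, stream));
}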
 
 //----------------------------------------------------------------------------
@@ -339,7 +366,7 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags(
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
   }
 
@@ -365,7 +392,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes,
 //----------------------------------------------------------------------------
 
 void HIPInternal::finalize() {
-  this->fence();
+  this->fence("Kokkos::HIPInternal::finalize: fence on finalization");
   was_finalized = true;
   if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
     using RecordHIP =
@@ -378,6 +405,9 @@ void HIPInternal::finalize() {
     if (m_team_scratch_current_size > 0)
       Kokkos::kokkos_free<Kokkos::Experimental::HIPSpace>(m_team_scratch_ptr);
 
+    if (m_manage_stream && m_stream != nullptr)
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream));
+
     m_hipDev                    = -1;
     m_hipArch                   = -1;
     m_multiProcCount            = 0;
@@ -395,28 +425,36 @@ void HIPInternal::finalize() {
     m_team_scratch_ptr          = nullptr;
   }
   if (nullptr != d_driverWorkArray) {
-    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
     d_driverWorkArray = nullptr;
   }
+
+  // only destroy these if we're finalizing the singleton
+  if (this == &singleton()) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
+  }
 }
 
 char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
   std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
   if (d_driverWorkArray == nullptr) {
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipHostMalloc(&d_driverWorkArray,
                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                       hipHostMallocNonCoherent));
   }
   if (driverTypeSize > m_maxDriverTypeSize) {
     // fence handles the cycle id reset for us
-    fence();
-    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
+    fence(
+        "Kokkos::HIPInternal::get_next_driver: fence before reallocating "
+        "resources");
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
     m_maxDriverTypeSize = driverTypeSize;
     if (m_maxDriverTypeSize % 128 != 0)
       m_maxDriverTypeSize =
           m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128;
-    HIP_SAFE_CALL(
+    KOKKOS_IMPL_HIP_SAFE_CALL(
         hipHostMalloc(&d_driverWorkArray,
                       m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                       hipHostMallocNonCoherent));
@@ -424,7 +462,9 @@ char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
     m_cycleId = (m_cycleId + 1) % m_maxDriverCycles;
     if (m_cycleId == 0) {
       // ensure any outstanding kernels are completed before we wrap around
-      fence();
+      fence(
+          "Kokkos::HIPInternal::get_next_driver: fence before reusing first "
+          "driver");
     }
   }
   return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId];
@@ -462,7 +502,14 @@ Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags(
 
 namespace Kokkos {
 namespace Impl {
-void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
+void hip_device_synchronize(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
+}
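
Callers supply a descriptive label so profiling tools can attribute the resulting fence event; a hypothetical call site (label text illustrative):

Kokkos::Impl::hip_device_synchronize(
    "MyApplication::finish_step: fence before host-side readback");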
 
 void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
                               const int line) {
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
index f4f88628e313a2d22d23a09e4ce25630d242a566..967c6fdd4be63e11b00c6b7f97b8d3d0b27bbcfc 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp
@@ -48,6 +48,7 @@
 #define KOKKOS_HIP_INSTANCE_HPP
 
 #include <Kokkos_HIP_Space.hpp>
+#include <HIP/Kokkos_HIP_Error.hpp>
 
 #include <mutex>
 
@@ -59,10 +60,12 @@ struct HIPTraits {
   static int constexpr WarpSize       = 64;
   static int constexpr WarpIndexMask  = 0x003f; /* hexadecimal for 63 */
   static int constexpr WarpIndexShift = 6;      /* WarpSize == 1 << WarpShift*/
+  static int constexpr ConservativeThreadsPerBlock =
+      256;  // conservative fallback blocksize in case of spills
   static int constexpr MaxThreadsPerBlock =
-      1024;  // FIXME_HIP -- assumed constant for now
-
+      1024;  // the maximum we can fit in a block
   static int constexpr ConstantMemoryUsage        = 0x008000; /* 32k bytes */
+  static int constexpr KernelArgumentLimit        = 0x001000; /*  4k bytes */
   static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */
 };
 
@@ -90,7 +93,7 @@ class HIPInternal {
   unsigned m_multiProcCount = 0;
   unsigned m_maxWarpCount   = 0;
   unsigned m_maxBlock       = 0;
-  unsigned m_maxBlocksPerSM = 0;
+  unsigned m_maxWavesPerCU  = 0;
   unsigned m_maxSharedWords = 0;
   int m_regsPerSM;
   int m_shmemPerSM       = 0;
@@ -108,6 +111,8 @@ class HIPInternal {
   mutable int m_cycleId = 0;
   // mutex to access d_driverWorkArray
   mutable std::mutex m_mutexWorkArray;
+  // mutex to access shared memory
+  mutable std::mutex m_mutexSharedMemory;
 
   // Scratch Spaces for Reductions
   size_type m_scratchSpaceCount = 0;
@@ -119,7 +124,10 @@ class HIPInternal {
 
   hipDeviceProp_t m_deviceProp;
 
-  hipStream_t m_stream = nullptr;
+  hipStream_t m_stream   = nullptr;
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::HIP>(reinterpret_cast<uintptr_t>(this));
+  bool m_manage_stream = false;
 
   // Team Scratch Level 1 Space
   mutable int64_t m_team_scratch_current_size = 0;
@@ -128,18 +136,25 @@ class HIPInternal {
 
   bool was_finalized = false;
 
+  // FIXME_HIP: these want to be per-device, not per-stream...  use of 'static'
+  // here will break once there are multiple devices though
+  static unsigned long *constantMemHostStaging;
+  static hipEvent_t constantMemReusable;
+
   static HIPInternal &singleton();
 
   int verify_is_initialized(const char *const label) const;
 
   int is_initialized() const { return m_hipDev >= 0; }
 
-  void initialize(int hip_device_id, hipStream_t stream = nullptr);
+  void initialize(int hip_device_id, hipStream_t stream = nullptr,
+                  bool manage_stream = false);
   void finalize();
 
   void print_configuration(std::ostream &) const;
 
   void fence() const;
+  void fence(const std::string &) const;
 
   // returns the next driver type pointer in our work array
   char *get_next_driver(size_t driverTypeSize) const;
@@ -151,13 +166,52 @@ class HIPInternal {
   // Resizing of reduction related scratch spaces
   size_type *scratch_space(const size_type size);
   size_type *scratch_flags(const size_type size);
-
+  uint32_t impl_get_instance_id() const noexcept;
   // Resizing of team level 1 scratch
   void *resize_team_scratch_space(std::int64_t bytes,
                                   bool force_shrink = false);
 };
 
 }  // namespace Impl
+
+// Partitioning an Execution Space: expects space and integer arguments for
+// relative weight
+//   Customization point for backends
+//   Default behavior is to return the passed-in instance
+
+namespace Impl {
+inline void create_HIP_instances(std::vector<HIP> &instances) {
+  for (int s = 0; s < int(instances.size()); s++) {
+    hipStream_t stream;
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
+    instances[s] = HIP(stream, true);
+  }
+}
+}  // namespace Impl
+
+template <class... Args>
+std::vector<HIP> partition_space(const HIP &, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+
+  std::vector<HIP> instances(sizeof...(Args));
+  Impl::create_HIP_instances(instances);
+  return instances;
+}
+
+template <class T>
+std::vector<HIP> partition_space(const HIP &, std::vector<T> &weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<HIP> instances(weights.size());
+  Impl::create_HIP_instances(instances);
+  return instances;
+}
 }  // namespace Experimental
 }  // namespace Kokkos
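For reference, a minimal usage sketch of the partition_space overloads added
above; the kernel bodies and problem sizes are hypothetical, and Kokkos is
assumed to be initialized. Each returned instance wraps a freshly created
hipStream_t, so the two loops below may run concurrently.

    #include <Kokkos_Core.hpp>

    void partition_space_example() {
      Kokkos::Experimental::HIP hip{};  // default execution space instance
      // Two sub-instances; the weights (1, 1) are accepted but, as the code
      // above shows, only the argument count currently matters.
      auto instances = Kokkos::Experimental::partition_space(hip, 1, 1);

      Kokkos::parallel_for(
          Kokkos::RangePolicy<Kokkos::Experimental::HIP>(instances[0], 0, 1000),
          KOKKOS_LAMBDA(int) { /* work submitted to the first stream */ });
      Kokkos::parallel_for(
          Kokkos::RangePolicy<Kokkos::Experimental::HIP>(instances[1], 0, 1000),
          KOKKOS_LAMBDA(int) { /* work submitted to the second stream */ });
      Kokkos::fence();  // wait for both instances
    }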
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
index f774423b378b0753a98c9e4df512b599910028dd..f209edf7c04ecc9b0001c4527e1bcebc0f24b256 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
@@ -52,6 +52,7 @@
 #include <HIP/Kokkos_HIP_Error.hpp>
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <Kokkos_HIP_Space.hpp>
+#include <HIP/Kokkos_HIP_Locks.hpp>
 
 // Must use global variable on the device with HIP-Clang
 #ifdef __HIP__
@@ -64,7 +65,7 @@ namespace Kokkos {
 namespace Experimental {
 template <typename T>
 inline __device__ T *kokkos_impl_hip_shared_memory() {
-  HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh);
+  extern __shared__ Kokkos::Experimental::HIPSpace::size_type sh[];
   return (T *)sh;
 }
 }  // namespace Experimental
@@ -74,10 +75,12 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+// The hip_parallel_launch_*_memory code is identical to the cuda code
 template <typename DriverType>
 __global__ static void hip_parallel_launch_constant_memory() {
   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
       kokkos_impl_hip_constant_memory_buffer));
+
   driver();
 }
 
@@ -87,12 +90,13 @@ __global__ __launch_bounds__(
   const DriverType &driver = *(reinterpret_cast<const DriverType *>(
       kokkos_impl_hip_constant_memory_buffer));
 
-  driver->operator()();
+  driver();
 }
 
 template <class DriverType>
 __global__ static void hip_parallel_launch_local_memory(
     const DriverType *driver) {
+  // FIXME_HIP driver() pass by copy
   driver->operator()();
 }
 
@@ -101,6 +105,21 @@ __global__ __launch_bounds__(
     maxTperB,
     minBperSM) static void hip_parallel_launch_local_memory(const DriverType
                                                                 *driver) {
+  // FIXME_HIP driver() pass by copy
+  driver->operator()();
+}
+
+template <typename DriverType>
+__global__ static void hip_parallel_launch_global_memory(
+    const DriverType *driver) {
+  driver->operator()();
+}
+
+template <typename DriverType, unsigned int maxTperB, unsigned int minBperSM>
+__global__ __launch_bounds__(
+    maxTperB,
+    minBperSM) static void hip_parallel_launch_global_memory(const DriverType
+                                                                 *driver) {
   driver->operator()();
 }
 
@@ -127,33 +146,238 @@ struct HIPDispatchProperties {
   HIPLaunchMechanism launch_mechanism = l;
 };
 
+// Use local memory up to ConstantMemoryUseThreshold
+// Use global memory above ConstantMemoryUsage
+// In between use ConstantMemory
+// The following code is identical to the cuda code
+template <typename DriverType>
+struct DeduceHIPLaunchMechanism {
+  static constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t
+      light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
+  static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t
+      heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight;
+  static constexpr typename DriverType::Policy::work_item_property property =
+      typename DriverType::Policy::work_item_property();
+
+  static constexpr HIPLaunchMechanism valid_launch_mechanism =
+      // BuildValidMask
+      (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
+           ? HIPLaunchMechanism::LocalMemory
+           : HIPLaunchMechanism::Default) |
+      (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
+           ? HIPLaunchMechanism::ConstantMemory
+           : HIPLaunchMechanism::Default) |
+      HIPLaunchMechanism::GlobalMemory;
+
+  static constexpr HIPLaunchMechanism requested_launch_mechanism =
+      (((property & light_weight) == light_weight)
+           ? HIPLaunchMechanism::LocalMemory
+           : HIPLaunchMechanism::ConstantMemory) |
+      HIPLaunchMechanism::GlobalMemory;
+
+  static constexpr HIPLaunchMechanism default_launch_mechanism =
+      // BuildValidMask
+      (sizeof(DriverType) < HIPTraits::ConstantMemoryUseThreshold)
+          ? HIPLaunchMechanism::LocalMemory
+          : ((sizeof(DriverType) < HIPTraits::ConstantMemoryUsage)
+                 ? HIPLaunchMechanism::ConstantMemory
+                 : HIPLaunchMechanism::GlobalMemory);
+
+  //              None                LightWeight    HeavyWeight
+  // F<UseT       LCG  LCG L  L       LCG  LG L  L   LCG  CG L  C
+  // UseT<F<KAL   LCG  LCG C  C       LCG  LG C  L   LCG  CG C  C
+  // Kal<F<CMU     CG  LCG C  C        CG  LG C  G    CG  CG C  C
+  // CMU<F          G  LCG G  G         G  LG G  G     G  CG G  G
+  static constexpr HIPLaunchMechanism launch_mechanism =
+      ((property & light_weight) == light_weight)
+          ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
+                 ? HIPLaunchMechanism::LocalMemory
+                 : HIPLaunchMechanism::GlobalMemory)
+          : (((property & heavy_weight) == heavy_weight)
+                 ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
+                        ? HIPLaunchMechanism::ConstantMemory
+                        : HIPLaunchMechanism::GlobalMemory)
+                 : (default_launch_mechanism));
+};
+
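To make the decision table above easier to scan, the same size/hint logic can
be written out as a standalone function; the enum and function names below are
hypothetical, and the thresholds mirror the HIPTraits constants shown earlier
in this diff.

    #include <cstddef>

    enum class Mechanism { Local, Constant, Global };

    constexpr Mechanism deduce(std::size_t functor_size, bool light_weight,
                               bool heavy_weight) {
      constexpr std::size_t use_threshold = 0x000200;  // ConstantMemoryUseThreshold
      constexpr std::size_t arg_limit     = 0x001000;  // KernelArgumentLimit
      constexpr std::size_t const_usage   = 0x008000;  // ConstantMemoryUsage
      if (light_weight)  // cheap kernels avoid the constant-buffer serialization
        return functor_size < arg_limit ? Mechanism::Local : Mechanism::Global;
      if (heavy_weight)  // prefer constant memory whenever the functor fits
        return functor_size < const_usage ? Mechanism::Constant
                                          : Mechanism::Global;
      // No hint: local below the use threshold, constant up to 32 kB, else global.
      if (functor_size < use_threshold) return Mechanism::Local;
      return functor_size < const_usage ? Mechanism::Constant : Mechanism::Global;
    }

    static_assert(deduce(64, false, false) == Mechanism::Local, "");
    static_assert(deduce(4096, false, false) == Mechanism::Constant, "");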
+template <typename DriverType, typename LaunchBounds,
+          HIPLaunchMechanism LaunchMechanism>
+struct HIPParallelLaunchKernelFuncData {
+  static unsigned int get_scratch_size(
+      hipFuncAttributes const &hip_func_attributes) {
+    return hip_func_attributes.localSizeBytes;
+  }
+
+  static hipFuncAttributes get_hip_func_attributes(void const *kernel_func) {
+    static hipFuncAttributes attr = [=]() {
+      hipFuncAttributes attr;
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipFuncGetAttributes(&attr, kernel_func));
+      return attr;
+    }();
+    return attr;
+  }
+};
+
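The helper above uses a query-once idiom: a function-local static initialized
by an immediately invoked lambda, so the HIP runtime is asked for the kernel's
attributes exactly once (thread-safely) and the result is reused afterwards. A
reduced standalone form, with a hypothetical name, might look like this; in the
patch the static lives inside a class template keyed on the driver type, so
every kernel instantiation gets its own cached copy.

    #include <hip/hip_runtime.h>

    template <typename KernelFunc>
    hipFuncAttributes cached_hip_func_attributes(KernelFunc kernel_func) {
      // Runs the lambda once per template instantiation; later calls return
      // the cached attributes (register count, static shared memory, ...).
      static hipFuncAttributes attr = [=]() {
        hipFuncAttributes a{};
        (void)hipFuncGetAttributes(&a,
                                   reinterpret_cast<const void *>(kernel_func));
        return a;
      }();
      return attr;
    }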
+//---------------------------------------------------------------//
+// HIPParallelLaunchKernelFunc structure and its specializations //
+//---------------------------------------------------------------//
 template <typename DriverType, typename LaunchBounds,
           HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunchKernelFunc;
 
+// HIPLaunchMechanism::LocalMemory specializations
 template <typename DriverType, unsigned int MaxThreadsPerBlock,
           unsigned int MinBlocksPerSM>
 struct HIPParallelLaunchKernelFunc<
     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
     HIPLaunchMechanism::LocalMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::LocalMemory>;
   static auto get_kernel_func() {
     return hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
                                             MinBlocksPerSM>;
   }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
 };
 
 template <typename DriverType>
 struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
                                    HIPLaunchMechanism::LocalMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::LocalMemory>;
+  static auto get_kernel_func() {
+    return HIPParallelLaunchKernelFunc<
+        DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+        HIPLaunchMechanism::LocalMemory>::get_kernel_func();
+  }
+
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+// HIPLaunchMechanism::GlobalMemory specializations
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct HIPParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    HIPLaunchMechanism::GlobalMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::GlobalMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
+                                             MinBlocksPerSM>;
+  }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+template <typename DriverType>
+struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                   HIPLaunchMechanism::GlobalMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::GlobalMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_global_memory<DriverType>;
+  }
+
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+// HIPLaunchMechanism::ConstantMemory specializations
+template <typename DriverType, unsigned int MaxThreadsPerBlock,
+          unsigned int MinBlocksPerSM>
+struct HIPParallelLaunchKernelFunc<
+    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+    HIPLaunchMechanism::ConstantMemory> {
+  using funcdata_t = HIPParallelLaunchKernelFuncData<
+      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
+      HIPLaunchMechanism::ConstantMemory>;
   static auto get_kernel_func() {
-    return hip_parallel_launch_local_memory<DriverType, 1024, 1>;
+    return hip_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
+                                               MinBlocksPerSM>;
+  }
+
+  static constexpr auto default_launchbounds() { return false; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
   }
 };
 
+template <typename DriverType>
+struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                   HIPLaunchMechanism::ConstantMemory> {
+  using funcdata_t =
+      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
+                                      HIPLaunchMechanism::ConstantMemory>;
+  static auto get_kernel_func() {
+    return hip_parallel_launch_constant_memory<DriverType>;
+  }
+  static constexpr auto default_launchbounds() { return true; }
+
+  static auto get_scratch_size() {
+    return funcdata_t::get_scratch_size(get_hip_func_attributes());
+  }
+
+  static hipFuncAttributes get_hip_func_attributes() {
+    return funcdata_t::get_hip_func_attributes(
+        reinterpret_cast<void const *>(get_kernel_func()));
+  }
+};
+
+//------------------------------------------------------------------//
+// HIPParallelLaunchKernelInvoker structure and its specializations //
+//------------------------------------------------------------------//
 template <typename DriverType, typename LaunchBounds,
           HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunchKernelInvoker;
 
+// HIPLaunchMechanism::LocalMemory specialization
 template <typename DriverType, typename LaunchBounds>
 struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
                                       HIPLaunchMechanism::LocalMemory>
@@ -170,21 +394,83 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
   }
 };
 
+// HIPLaunchMechanism::GlobalMemory specialization
+template <typename DriverType, typename LaunchBounds>
+struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                      HIPLaunchMechanism::GlobalMemory>
+    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::GlobalMemory> {
+  using base_t = HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                             HIPLaunchMechanism::GlobalMemory>;
+
+  // FIXME_HIP the code differs from the CUDA version because the driver
+  // cannot be passed by copy
+  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
+                            dim3 const &block, int shmem,
+                            HIPInternal const *hip_instance) {
+    (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
+        driver);
+  }
+};
+
+// HIPLaunchMechanism::ConstantMemory specializations
+template <typename DriverType, typename LaunchBounds>
+struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
+                                      HIPLaunchMechanism::ConstantMemory>
+    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::ConstantMemory> {
+  using base_t =
+      HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
+                                  HIPLaunchMechanism::ConstantMemory>;
+  static_assert(sizeof(DriverType) < HIPTraits::ConstantMemoryUsage,
+                "Kokkos Error: Requested HIPLaunchConstantMemory with a "
+                "Functor larger than 32kB.");
+
+  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
+                            dim3 const &block, int shmem,
+                            HIPInternal const *hip_instance) {
+    // Wait until the previous kernel that uses the constant buffer is done
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipEventSynchronize(hip_instance->constantMemReusable));
+
+    // Copy functor (synchronously) to staging buffer in pinned host memory
+    unsigned long *staging = hip_instance->constantMemHostStaging;
+    std::memcpy((void *)staging, (void *)driver, sizeof(DriverType));
+
+    // Copy functor asynchronously from there to constant memory on the device
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(
+        HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer), staging,
+        sizeof(DriverType), 0, hipMemcpyHostToDevice, hip_instance->m_stream));
+
+    // Invoke the driver function on the device
+    (base_t::
+         get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>();
+
+    // Record an event that says when the constant buffer can be reused
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventRecord(hip_instance->constantMemReusable,
+                                             hip_instance->m_stream));
+  }
+};
+
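The ConstantMemory invoker above serializes reuse of the single device-wide
constant buffer with a host-visible event. Stripped of the Kokkos plumbing, the
pattern looks roughly like the sketch below; the buffer and function names are
hypothetical and error checking is omitted.

    #include <hip/hip_runtime.h>
    #include <cstring>

    __constant__ unsigned long sketch_constant_buffer[0x8000 /
                                                      sizeof(unsigned long)];

    template <typename Functor>
    void enqueue_via_constant_memory(const Functor &f, dim3 grid, dim3 block,
                                     int shmem, hipStream_t stream,
                                     hipEvent_t buffer_reusable,
                                     unsigned long *pinned_staging,
                                     void (*kernel)()) {
      // 1. Block the host until the previous kernel reading the buffer is done.
      hipEventSynchronize(buffer_reusable);
      // 2. Stage the functor in pinned host memory so step 3 can be async.
      std::memcpy(pinned_staging, &f, sizeof(Functor));
      // 3. Enqueue the copy into __constant__ memory on the launch stream ...
      hipMemcpyToSymbolAsync(HIP_SYMBOL(sketch_constant_buffer), pinned_staging,
                             sizeof(Functor), 0, hipMemcpyHostToDevice, stream);
      // 4. ... launch, then record when the buffer becomes free again.
      kernel<<<grid, block, shmem, stream>>>();
      hipEventRecord(buffer_reusable, stream);
    }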
+//-----------------------------//
+// HIPParallelLaunch structure //
+//-----------------------------//
 template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
-          HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory>
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
 struct HIPParallelLaunch;
 
 template <typename DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
+          unsigned int MinBlocksPerSM, HIPLaunchMechanism LaunchMechanism>
 struct HIPParallelLaunch<
     DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    HIPLaunchMechanism::LocalMemory>
+    LaunchMechanism>
     : HIPParallelLaunchKernelInvoker<
           DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-          HIPLaunchMechanism::LocalMemory> {
+          LaunchMechanism> {
   using base_t = HIPParallelLaunchKernelInvoker<
       DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-      HIPLaunchMechanism::LocalMemory>;
+      LaunchMechanism>;
 
   HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
                     const dim3 &block, const int shmem,
@@ -205,22 +491,48 @@ struct HIPParallelLaunch<
       base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance);
 
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
-      HIP_SAFE_CALL(hipGetLastError());
-      hip_instance->fence();
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipGetLastError());
+      hip_instance->fence(
+          "Kokkos::Experimental::Impl::HIParallelLaunch: Debug Only Check for "
+          "Execution Error");
 #endif
     }
   }
+};
 
-  static hipFuncAttributes get_hip_func_attributes() {
-    static hipFuncAttributes attr = []() {
-      hipFuncAttributes attr;
-      HIP_SAFE_CALL(hipFuncGetAttributes(
-          &attr, reinterpret_cast<void const *>(base_t::get_kernel_func())));
-      return attr;
-    }();
-    return attr;
+// Convenience method to launch the correct kernel for the given launch bounds
+// and launch mechanism.
+template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
+          HIPLaunchMechanism LaunchMechanism =
+              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
+void hip_parallel_launch(const DriverType &driver, const dim3 &grid,
+                         const dim3 &block, const int shmem,
+                         const HIPInternal *hip_instance,
+                         const bool prefer_shmem) {
+  // FIXME_HIP - could be if constexpr for c++17
+  if (!HIPParallelLaunch<DriverType, LaunchBounds,
+                         LaunchMechanism>::default_launchbounds()) {
+    // for user defined, we *always* honor the request
+    HIPParallelLaunch<DriverType, LaunchBounds, LaunchMechanism>(
+        driver, grid, block, shmem, hip_instance, prefer_shmem);
+  } else {
+    // we can do what we like
+    const unsigned flat_block_size = block.x * block.y * block.z;
+    if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) {
+      // the requested block size fits within the conservative launch bounds
+      HIPParallelLaunch<
+          DriverType,
+          Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
+          LaunchMechanism>(driver, grid, block, shmem, hip_instance,
+                           prefer_shmem);
+    } else {
+      HIPParallelLaunch<DriverType,
+                        Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
+                        LaunchMechanism>(driver, grid, block, shmem,
+                                         hip_instance, prefer_shmem);
+    }
   }
-};
+}
 }  // namespace Impl
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
index 4f5271b6f644605e24ab277a7b08b25ba8c2ea84..c4292d35eca793bc58d76ba20db4358f85810996 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp
@@ -84,11 +84,17 @@ namespace Impl {
 HIPLockArrays g_host_hip_lock_arrays = {nullptr, nullptr, 0};
 
 void initialize_host_hip_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::init_lock_arrays();
+
+  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
+#endif
+
   if (g_host_hip_lock_arrays.atomic != nullptr) return;
-  HIP_SAFE_CALL(hipMalloc(
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
       &g_host_hip_lock_arrays.atomic,
       sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1)));
-  HIP_SAFE_CALL(hipMalloc(
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
       &g_host_hip_lock_arrays.scratch,
       sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency())));
 
@@ -103,10 +109,14 @@ void initialize_host_hip_lock_arrays() {
 }
 
 void finalize_host_hip_lock_arrays() {
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  desul::Impl::finalize_lock_arrays();
+#endif
+
   if (g_host_hip_lock_arrays.atomic == nullptr) return;
-  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
   g_host_hip_lock_arrays.atomic = nullptr;
-  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
   g_host_hip_lock_arrays.scratch = nullptr;
   g_host_hip_lock_arrays.n       = 0;
 #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
index f34f85f43b0bb2ac2b07d4149957f5027991395f..71b104c2e4b65aff7ab3b3688c0901d000e8d9d8 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp
@@ -51,6 +51,10 @@
 
 #include <HIP/Kokkos_HIP_Error.hpp>
 
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics/Lock_Array_HIP.hpp>
+#endif
+
 namespace Kokkos {
 namespace Impl {
 
@@ -147,7 +151,7 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 #define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                 \
   {                                                             \
     if (::Kokkos::Impl::lock_array_copied == 0) {               \
-      HIP_SAFE_CALL(hipMemcpyToSymbol(                          \
+      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol(              \
           HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \
           &::Kokkos::Impl::g_host_hip_lock_arrays,              \
           sizeof(::Kokkos::Impl::HIPLockArrays)));              \
@@ -155,6 +159,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
     ::Kokkos::Impl::lock_array_copied = 1;                      \
   }
 
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+
 #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
 #else
@@ -162,6 +168,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
   KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
 #endif
 
+#else
+
+#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#else
+// Still need KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE for team scratch etc.
+#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
+  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()         \
+  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#endif
+
+#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */
+
 #endif /* defined( __HIPCC__ ) */
 
 #endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */
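A short sketch of how the desul-aware initialize/finalize pair touched above is
expected to bracket execution; in Kokkos these helpers are reached through HIP
backend initialization and finalization rather than being called directly, so
the call sites here are for illustration only.

    #include <HIP/Kokkos_HIP_Locks.hpp>  // internal header, as in this patch

    void lock_array_lifecycle_sketch() {
      Kokkos::Impl::initialize_host_hip_lock_arrays();
      // With KOKKOS_ENABLE_IMPL_DESUL_ATOMICS defined, this now also runs
      // desul::Impl::init_lock_arrays() and pushes desul's arrays to the device.

      // ... launch kernels that rely on lock-based (non-native) atomics ...

      Kokkos::Impl::finalize_host_hip_lock_arrays();
      // ... which likewise tears desul's lock arrays down first.
    }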
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
index ce1aff9586d25911104d17d53860409f3e73b10b..acb538e1cb3970bab9bafc2f44d3568b34c6c31f 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp
@@ -28,7 +28,8 @@ inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::HIP>(
       space.impl_internal_space_instance()->m_maxThreadsPerSM;
   properties.default_largest_tile_size = 16;
   properties.default_tile_size         = 4;
-  properties.max_total_tile_size       = 1024;
+  properties.max_total_tile_size =
+      Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
   return properties;
 }
 
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
index 35e7d6fb853ae9e4f245e0fe0c2a71f4f2d4d6c2..eae323dd913d7f20383e3afcf5f1264d013b7be1 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp
@@ -81,6 +81,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   inline void execute() const {
+    using ClosureType =
+        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
     if (m_policy.m_num_tiles == 0) return;
     array_index_type const maxblocks = static_cast<array_index_type>(
         m_policy.space().impl_internal_space_instance()->m_maxBlock);
@@ -94,7 +96,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                        block.y,
                    maxblocks),
           1);
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 3) {
@@ -110,7 +113,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 4) {
@@ -128,7 +132,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 5) {
@@ -147,7 +152,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
           std::min((m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) /
                        block.z,
                    maxblocks));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else if (Policy::rank == 6) {
@@ -165,7 +171,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                       std::min(static_cast<index_type>(m_policy.m_tile_end[4] *
                                                        m_policy.m_tile_end[5]),
                                static_cast<index_type>(maxblocks)));
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, 0,
           m_policy.space().impl_internal_space_instance(), false);
     } else {
@@ -178,22 +185,18 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
       : m_functor(arg_functor), m_policy(arg_policy) {}
 
   template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
+  static int max_tile_size_product(const Policy&, const Functor&) {
     using closure_type =
         ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                     Kokkos::Experimental::HIP>;
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    auto const& prop = pol.space().hip_device_prop();
-    // Limits due to registers/SM, MDRange doesn't have
-    // shared memory constraints
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(
-            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
+                                                          LaunchBounds>();
+    if (block_size == 0)
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
+                      "tile size."));
+    return block_size;
   }
 };
 
@@ -242,6 +245,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   const bool m_result_ptr_device_accessible;
   size_type* m_scratch_space;
   size_type* m_scratch_flags;
+  // Only let one Parallel/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
   using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
       Policy::rank, Policy, FunctorType, WorkTag, reference_type>;
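The new m_shared_memory_lock member relies on std::unique_lock's
constructor/destructor pairing: the mutex is acquired when the closure is
constructed and released automatically when the closure is destroyed, so two
reductions or scans built against the same HIPInternal instance cannot
interleave their use of the shared buffers. A minimal illustration with
hypothetical names:

    #include <mutex>

    struct ClosureSketch {
      std::unique_lock<std::mutex> m_shared_memory_lock;

      explicit ClosureSketch(std::mutex &instance_mutex)
          : m_shared_memory_lock(instance_mutex) {}  // blocks if already held

      void execute() const { /* launch kernels that touch shared buffers */ }
    };  // ~ClosureSketch() runs ~unique_lock(), which releases the mutex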
@@ -307,32 +313,30 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   // Determine block size constrained by shared memory:
   // This is copy/paste from Kokkos_HIP_Parallel_Range
   inline unsigned local_block_size(const FunctorType& f) {
-    unsigned int n =
-        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
-    int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
-        false, FunctorType, WorkTag>(f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned>(
-             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                  LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
-          false, FunctorType, WorkTag>(f, n);
+    const auto& instance = m_policy.space().impl_internal_space_instance();
+    auto shmem_functor   = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      WorkTag>(f, n);
+    };
+    using closure_type = ParallelReduce<FunctorType, Policy, ReducerType,
+                                        Kokkos::Experimental::HIP>;
+
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<closure_type,
+                                                                LaunchBounds>(
+            instance, shmem_functor);
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid tile size."));
     }
-    return n;
+    return block_size;
   }
 
   inline void execute() {
-    const int nwork = m_policy.m_num_tiles;
+    using ClosureType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                       Kokkos::Experimental::HIP>;
+    const int nwork   = m_policy.m_num_tiles;
     if (nwork) {
       int block_size = m_policy.m_prod_tile_dims;
       // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
@@ -366,14 +370,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
               false, FunctorType, WorkTag>(m_functor, block.y);
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
+                                                      LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().fence();
+        m_policy.space().fence(
+            "Kokkos::Impl::ParallelReduce<MDRangePolicy,HIP>: fence because "
+            "reduction can't access result storage location");
 
         if (m_result_ptr) {
           const int size = ValueTraits::value_size(
@@ -403,7 +409,10 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
             MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
                               typename ViewType::memory_space>::accessible),
         m_scratch_space(nullptr),
-        m_scratch_flags(nullptr) {}
+        m_scratch_flags(nullptr),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 
   ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                  const ReducerType& reducer)
@@ -416,23 +425,25 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                               typename ReducerType::result_view_type::
                                   memory_space>::accessible),
         m_scratch_space(nullptr),
-        m_scratch_flags(nullptr) {}
+        m_scratch_flags(nullptr),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
+
   template <typename Policy, typename Functor>
-  static int max_tile_size_product(const Policy& pol, const Functor&) {
+  static int max_tile_size_product(const Policy&, const Functor&) {
     using closure_type =
         ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                        ReducerType, Kokkos::Experimental::HIP>;
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    auto const& prop = pol.space().hip_device_prop();
-    // Limits due do registers/SM
-    int const regs_per_sm        = prop.regsPerMultiprocessor;
-    int const regs_per_thread    = attr.numRegs;
-    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
-    return std::min(
-        max_threads_per_sm,
-        static_cast<int>(
-            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
+    unsigned block_size =
+        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
+                                                          LaunchBounds>();
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid tile size."));
+    }
+    return block_size;
   }
 };
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
index 7d2825eeb4c6be1d060d1e8d7c3eb67097729ccf..e02ead1e990151a30d0a87b280bada8c774ca5c5 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp
@@ -108,16 +108,21 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   inline void execute() const {
     const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
 
+    using DriverType =
+        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
     const int block_size =
-        LaunchBounds::maxTperB
-            ? LaunchBounds::maxTperB
-            : ::Kokkos::Experimental::Impl::HIPTraits::
-                  MaxThreadsPerBlock;  // FIXME_HIP Choose block_size better
+        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<DriverType,
+                                                                LaunchBounds>();
     const dim3 block(1, block_size, 1);
     const dim3 grid(
         typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1);
 
-    Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a "
+                      "valid execution configuration."));
+    }
+    Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
         *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
         false);
   }
@@ -173,15 +178,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   const bool m_result_ptr_host_accessible;
   size_type* m_scratch_space = nullptr;
   size_type* m_scratch_flags = nullptr;
+  // Only let one ParallelReduce/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
-#if HIP_VERSION < 401
-  static bool constexpr UseShflReduction =
-      ((sizeof(value_type) > 2 * sizeof(double)) &&
-       static_cast<bool>(ValueTraits::StaticValueSize));
-#else
   static bool constexpr UseShflReduction =
       static_cast<bool>(ValueTraits::StaticValueSize);
-#endif
 
  private:
   struct ShflReductionTag {};
@@ -328,30 +330,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   // Determine block size constrained by shared memory:
   inline unsigned local_block_size(const FunctorType& f) {
-    unsigned int n =
-        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
-    int shmem_size =
-        hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-            f, n);
-    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type, LaunchBounds>::get_hip_func_attributes();
-    while (
-        (n &&
-         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
-          shmem_size)) ||
-        (n >
-         static_cast<unsigned int>(
-             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                  LaunchBounds>(
-                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
-                 shmem_size, 0)))) {
-      n >>= 1;
-      shmem_size =
-          hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
-              f, n);
-    }
-    return n;
+    const auto& instance = m_policy.space().impl_internal_space_instance();
+    auto shmem_functor   = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      WorkTag>(f, n);
+    };
+    using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                      Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, LaunchBounds>(instance, shmem_functor);
   }
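The hip_get_preferred_blocksize helper used here encapsulates the kind of
halving search that the removed loop above performed inline; a reduced sketch,
reconstructed from that loop, is shown below. It only enforces the
shared-memory limit and ignores the occupancy checks the real helper also
performs.

    template <typename ShmemFunctor>
    unsigned preferred_blocksize_sketch(unsigned max_threads_per_block,
                                        int max_shmem_per_block,
                                        const ShmemFunctor &shmem_for_block) {
      unsigned n = max_threads_per_block;  // e.g. HIPTraits::MaxThreadsPerBlock
      while (n > 0 && max_shmem_per_block < shmem_for_block(n)) {
        n >>= 1;  // halve until the required shared memory fits
      }
      return n;  // 0 signals that no block size satisfies the constraint
    }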
 
   inline void execute() {
@@ -362,7 +349,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                  !std::is_same<ReducerType, InvalidType>::value;
     if ((nwork > 0) || need_device_set) {
       const int block_size = local_block_size(m_functor);
-      KOKKOS_ASSERT(block_size > 0);
+      if (block_size == 0) {
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                        "valid execution configuration."));
+      }
 
       m_scratch_space =
           ::Kokkos::Experimental::Impl::hip_internal_scratch_space(
@@ -391,14 +382,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                                          WorkTag>(m_functor,
                                                                   block.y);
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    LaunchBounds>(
+      using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
+                                        Kokkos::Experimental::HIP>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       if (!m_result_ptr_device_accessible) {
-        m_policy.space().impl_internal_space_instance()->fence();
+        m_policy.space().impl_internal_space_instance()->fence(
+            "Kokkos::Impl::ParallelReduce<RangePolicy,HIP>: fence because "
+            "reduction can't access result storage location");
 
         if (m_result_ptr) {
           const int size = ValueTraits::value_size(
@@ -429,7 +423,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                               typename ViewType::memory_space>::accessible),
         m_result_ptr_host_accessible(
             MemorySpaceAccess<Kokkos::HostSpace,
-                              typename ViewType::memory_space>::accessible) {}
+                              typename ViewType::memory_space>::accessible),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 
   ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                  const ReducerType& reducer)
@@ -444,7 +441,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
         m_result_ptr_host_accessible(
             MemorySpaceAccess<Kokkos::HostSpace,
                               typename ReducerType::result_view_type::
-                                  memory_space>::accessible) {}
+                                  memory_space>::accessible),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 };
 
 template <class FunctorType, class... Traits>
@@ -482,6 +482,9 @@ class ParallelScanHIPBase {
   size_type* m_scratch_flags = nullptr;
   size_type m_final          = false;
   int m_grid_x               = 0;
+  // Only let one ParallelReduce/Scan modify the shared memory. The
+  // constructor acquires the mutex which is released in the destructor.
+  std::unique_lock<std::mutex> m_shared_memory_lock;
 
  private:
   template <class TagType>
@@ -624,22 +627,7 @@ class ParallelScanHIPBase {
   }
 
   // Determine block size constrained by shared memory:
-  inline unsigned local_block_size(const FunctorType& f) {
-    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
-    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y
-    //
-    // TODO check best option
-
-    unsigned n = Experimental::Impl::HIPTraits::WarpSize * 4;
-    while (n && static_cast<unsigned>(m_policy.space()
-                                          .impl_internal_space_instance()
-                                          ->m_maxShmemPerBlock) <
-                    hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
-                                                             WorkTag>(f, n)) {
-      n >>= 1;
-    }
-    return n;
-  }
+  virtual inline unsigned local_block_size(const FunctorType& f) = 0;
 
   inline void impl_execute() {
     const index_type nwork = m_policy.end() - m_policy.begin();
@@ -649,7 +637,11 @@ class ParallelScanHIPBase {
       const int gridMaxComputeCapability_2x = 0x01fff;
 
       const int block_size = static_cast<int>(local_block_size(m_functor));
-      KOKKOS_ASSERT(block_size > 0);
+      if (block_size == 0) {
+        Kokkos::Impl::throw_runtime_exception(
+            std::string("Kokkos::Impl::ParallelScan< HIP > could not find a "
+                        "valid execution configuration."));
+      }
 
       const int grid_max =
           std::min(block_size * block_size, gridMaxComputeCapability_2x);
@@ -674,15 +666,16 @@ class ParallelScanHIPBase {
       const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);
 
       m_final = false;
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
-                                                    LaunchBounds>(
+      // these ones are OK to be just the base because the specializations
+      // do not modify the kernel at all
+      using DriverType = ParallelScanHIPBase<FunctorType, Traits...>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
 
       m_final = true;
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
-                                                    LaunchBounds>(
+      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
           *this, grid, block, shmem,
           m_policy.space().impl_internal_space_instance(),
           false);  // copy to device and execute
@@ -690,13 +683,17 @@ class ParallelScanHIPBase {
   }
 
   ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy)
-      : m_functor(arg_functor), m_policy(arg_policy) {}
+      : m_functor(arg_functor),
+        m_policy(arg_policy),
+        m_shared_memory_lock(m_policy.space()
+                                 .impl_internal_space_instance()
+                                 ->m_mutexSharedMemory) {}
 };
 
 template <class FunctorType, class... Traits>
 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                    Kokkos::Experimental::HIP>
-    : private ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, Traits...> {
  public:
   using Base = ParallelScanHIPBase<FunctorType, Traits...>;
   using Base::operator();
@@ -706,6 +703,23 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   ParallelScan(const FunctorType& arg_functor,
                const typename Base::Policy& arg_policy)
       : Base(arg_functor, arg_policy) {}
+
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be a power of two: 128 (2 warps), 256 (4 warps), or
+    // 512 (8 warps); gridDim.x <= blockDim.y * blockDim.y
+
+    const auto& instance =
+        Base::m_policy.space().impl_internal_space_instance();
+    auto shmem_functor = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      typename Base::WorkTag>(
+          f, n);
+    };
+    using DriverType = ParallelScan<FunctorType, typename Base::Policy,
+                                    Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -713,7 +727,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 template <class FunctorType, class ReturnType, class... Traits>
 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                             ReturnType, Kokkos::Experimental::HIP>
-    : private ParallelScanHIPBase<FunctorType, Traits...> {
+    : public ParallelScanHIPBase<FunctorType, Traits...> {
  public:
   using Base = ParallelScanHIPBase<FunctorType, Traits...>;
   using Base::operator();
@@ -737,6 +751,24 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                         const typename Base::Policy& arg_policy,
                         ReturnType& arg_returnvalue)
       : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {}
+
+  inline unsigned local_block_size(const FunctorType& f) {
+    // blockDim.y must be a power of two: 128 (2 warps), 256 (4 warps), or
+    // 512 (8 warps); gridDim.x <= blockDim.y * blockDim.y
+
+    const auto& instance =
+        Base::m_policy.space().impl_internal_space_instance();
+    auto shmem_functor = [&f](unsigned n) {
+      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
+                                                      typename Base::WorkTag>(
+          f, n);
+    };
+    using DriverType =
+        ParallelScanWithTotal<FunctorType, typename Base::Policy, ReturnType,
+                              Kokkos::Experimental::HIP>;
+    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
+        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
+  }
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
index 96c3ff2a751027a4eb05b03c99487207c9acf708..b794f5bc037111a8774ed23d1181326d3fa23b51 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -56,20 +56,20 @@
 
 namespace Kokkos {
 namespace Impl {
+
 template <typename... Properties>
 class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     : public PolicyTraits<Properties...> {
  public:
   using execution_policy = TeamPolicyInternal;
 
-  using traits = PolicyTraits<Properties...>;
+  using traits    = PolicyTraits<Properties...>;
+  using BlockType = Kokkos::Experimental::Impl::BlockType;
 
   template <typename ExecSpace, typename... OtherProperties>
   friend class TeamPolicyInternal;
 
  private:
-  static int constexpr MAX_WARP = 8;
-
   typename traits::execution_space m_space;
   int m_league_size;
   int m_team_size;
@@ -101,17 +101,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   template <typename FunctorType>
   int team_size_max(FunctorType const& f, ParallelForTag const&) const {
     using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size<
-        FunctorType, typename traits::launch_bounds>(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
-    return block_size / impl_vector_length();
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+
+    return internal_team_size_common<BlockType::Max, closure_type>(f);
   }
 
   template <class FunctorType>
@@ -129,8 +121,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     return internal_team_size_max<closure_type>(f);
   }
 
-  template <class FunctorType, class ReducerType>
-  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
+  template <typename FunctorType, typename ReducerType>
+  inline int team_size_max(const FunctorType& f, const ReducerType&,
                            const ParallelReduceTag&) const {
     using closure_type =
         Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
@@ -141,17 +133,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   template <typename FunctorType>
   int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
     using closure_type =
-        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-        FunctorType, typename traits::launch_bounds>(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
-    return block_size / impl_vector_length();
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
+
+    return internal_team_size_common<BlockType::Preferred, closure_type>(f);
   }
 
   template <typename FunctorType>
@@ -169,7 +153,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
     return internal_team_size_recommended<closure_type>(f);
   }
 
-  template <class FunctorType, class ReducerType>
+  template <typename FunctorType, typename ReducerType>
   int team_size_recommended(FunctorType const& f, ReducerType const&,
                             ParallelReduceTag const&) const {
     using closure_type =
@@ -177,6 +161,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
                              ReducerType>;
     return internal_team_size_recommended<closure_type>(f);
   }
+
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
   static int vector_length_max() {
@@ -211,7 +196,10 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
   inline void impl_set_team_size(size_t size) { m_team_size = size; }
   int impl_vector_length() const { return m_vector_length; }
+
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
+#endif
 
   int team_size() const { return m_team_size; }
 
@@ -266,7 +254,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
           "space.");
 
     // Make sure total block size is permissible
-    if (m_team_size * m_vector_length > 1024) {
+    if (m_team_size * m_vector_length >
+        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock) {
       Impl::throw_runtime_exception(
           std::string("Kokkos::TeamPolicy< HIP > the team size is too large. "
                       "Team size x vector length must be smaller than 1024."));
@@ -363,26 +352,84 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
   using member_type = Kokkos::Impl::HIPTeamMember;
 
  protected:
-  template <class ClosureType, class FunctorType, class BlockSizeCallable>
-  int internal_team_size_common(const FunctorType& f,
-                                BlockSizeCallable&& block_size_callable) const {
-    using closure_type = ClosureType;
+  template <BlockType BlockSize, class ClosureType, class FunctorType>
+  int internal_team_size_common(const FunctorType& f) const {
+    // FIXME_HIP: this could be unified with internal_team_size_common_reduce
+    //            once we can rely on C++17 if constexpr by default.
+    //            The problem right now is that we cannot avoid evaluating
+    //            functor_value_traits' value_size / StaticValueSize for
+    //            closures that do not perform a reduction.
+
+    const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
+    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double);
+    const int vector_length     = impl_vector_length();
+
+    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
+                             const hipFuncAttributes& attr, int block_size) {
+      int functor_shmem =
+          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+              f, block_size / vector_length);
+      return shmem_block + shmem_thread * (block_size / vector_length) +
+             functor_shmem + attr.sharedSizeBytes;
+    };
+    int block_size;
+    // FIXME_HIP - could use if constexpr once C++17 is required
+    if (BlockSize == BlockType::Max) {
+      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
+          ClosureType, typename traits::launch_bounds>(
+          space().impl_internal_space_instance(), functor);
+    } else {
+      block_size =
+          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
+              ClosureType, typename traits::launch_bounds>(
+              space().impl_internal_space_instance(), functor);
+    }
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
+                      "team size."));
+    }
+    return block_size / impl_vector_length();
+  }
+
+  template <BlockType BlockSize, class ClosureType, class FunctorType>
+  int internal_team_size_common_reduce(const FunctorType& f) const {
     using functor_value_traits =
         Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;
 
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        closure_type,
-        typename traits::launch_bounds>::get_hip_func_attributes();
-    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
-        space().impl_internal_space_instance(), attr, f,
-        static_cast<size_t>(impl_vector_length()),
-        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
-        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double) +
-            ((functor_value_traits::StaticValueSize != 0)
-                 ? 0
-                 : functor_value_traits::value_size(f)));
-    KOKKOS_ASSERT(block_size > 0);
+    const unsigned shmem_block  = team_scratch_size(0) + 2 * sizeof(double);
+    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double) +
+                                  ((functor_value_traits::StaticValueSize != 0)
+                                       ? 0
+                                       : functor_value_traits::value_size(f));
+    const int vector_length = impl_vector_length();
+
+    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
+                             const hipFuncAttributes& attr, int block_size) {
+      int functor_shmem =
+          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+              f, block_size / vector_length);
+      return shmem_block + shmem_thread * (block_size / vector_length) +
+             functor_shmem + attr.sharedSizeBytes;
+    };
+    int block_size;
+    // FIXME_HIP - could use if constexpr once C++17 is required
+    if (BlockSize == BlockType::Max) {
+      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
+          ClosureType, typename traits::launch_bounds>(
+          space().impl_internal_space_instance(), functor);
+    } else {
+      block_size =
+          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
+              ClosureType, typename traits::launch_bounds>(
+              space().impl_internal_space_instance(), functor);
+    }
 
+    if (block_size == 0) {
+      Kokkos::Impl::throw_runtime_exception(
+          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
+                      "valid team size."));
+    }
     // Currently we require Power-of-2 team size for reductions.
     int p2 = 1;
     while (p2 <= block_size) p2 *= 2;
@@ -392,16 +439,13 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
 
   template <class ClosureType, class FunctorType>
   int internal_team_size_max(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f, ::Kokkos::Experimental::Impl::hip_get_max_block_size<
-               FunctorType, typename traits::launch_bounds>);
+    return internal_team_size_common_reduce<BlockType::Max, ClosureType>(f);
   }
 
   template <class ClosureType, class FunctorType>
   int internal_team_size_recommended(const FunctorType& f) const {
-    return internal_team_size_common<ClosureType>(
-        f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-               FunctorType, typename traits::launch_bounds>);
+    return internal_team_size_common_reduce<BlockType::Preferred, ClosureType>(
+        f);
   }
 };
 
@@ -505,7 +549,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     dim3 const block(static_cast<int>(m_vector_size),
                      static_cast<int>(m_team_size), 1);
 
-    ::Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, launch_bounds>(
+    using closure_type =
+        ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                    Kokkos::Experimental::HIP>;
+    ::Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
+                                                      launch_bounds>(
         *this, grid, block, shmem_size_total,
         m_policy.space().impl_internal_space_instance(),
         true);  // copy to device and execute
@@ -520,17 +568,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelFor, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
-                  FunctorType, launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor, ParallelForTag());
 
     m_shmem_begin = (sizeof(double) * (m_team_size + 2));
     m_shmem_size =
@@ -556,23 +596,12 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     int const shmem_size_total = m_shmem_begin + m_shmem_size;
     if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
         shmem_size_total) {
-      printf(
-          "%i %i\n",
-          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
-          shmem_size_total);
       Kokkos::Impl::throw_runtime_exception(std::string(
           "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory"));
     }
 
-    if (static_cast<int>(m_team_size) >
-        static_cast<int>(
-            ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
-                                                                 launch_bounds>(
-                m_policy.space().impl_internal_space_instance(), attr,
-                arg_functor, arg_policy.impl_vector_length(),
-                arg_policy.team_scratch_size(0),
-                arg_policy.thread_scratch_size(0)) /
-            arg_policy.impl_vector_length())) {
+    size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(std::string(
           "Kokkos::Impl::ParallelFor< HIP > requested too large team size."));
     }
@@ -839,8 +868,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       }
       const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
 
-      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
-                                                    launch_bounds>(
+      using closure_type =
+          ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
+                         ReducerType, Kokkos::Experimental::HIP>;
+      Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
+                                                      launch_bounds>(
           *this, grid, block, shmem_size_total,
           m_policy.space().impl_internal_space_instance(),
           true);  // copy to device and execute
@@ -890,17 +922,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelReduce, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
-                                                                 launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
+    m_team_size = m_team_size >= 0 ? m_team_size
+                                   : arg_policy.team_size_recommended(
+                                         arg_functor, ParallelReduceTag());
 
     m_team_begin =
         UseShflReduction
@@ -958,8 +982,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                       "L0 scratch memory"));
     }
 
-    if (static_cast<int>(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+    size_t max_size =
+        arg_policy.team_size_max(arg_functor, ParallelReduceTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                       "large team size."));
@@ -992,18 +1017,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_scratch_lock(m_policy.space()
                            .impl_internal_space_instance()
                            ->m_team_scratch_mutex) {
-    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
-        ParallelReduce, launch_bounds>::get_hip_func_attributes();
-    m_team_size =
-        m_team_size >= 0
-            ? m_team_size
-            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
-                                                                 launch_bounds>(
-                  m_policy.space().impl_internal_space_instance(), attr,
-                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
-                  m_policy.thread_scratch_size(0)) /
-                  m_vector_size;
-
+    m_team_size = m_team_size >= 0
+                      ? m_team_size
+                      : arg_policy.team_size_recommended(arg_functor, reducer,
+                                                         ParallelReduceTag());
     m_team_begin =
         UseShflReduction
             ? 0
@@ -1046,7 +1063,6 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // upon team size.
 
     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;
-
     if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
          !UseShflReduction) ||
         m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
@@ -1054,8 +1070,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
     }
-    if (static_cast<int>(m_team_size) >
-        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
+
+    size_t max_size =
+        arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag());
+    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
       Kokkos::Impl::throw_runtime_exception(
           std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                       "large team size."));
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
index 15ca089d14740b6a2c42c69945a17a0c7bfa1bcc..e25ebe2ab355e626273aeff34615db45aa3465c7 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -67,102 +67,32 @@ namespace {
 hipStream_t get_deep_copy_stream() {
   static hipStream_t s = nullptr;
   if (s == nullptr) {
-    HIP_SAFE_CALL(hipStreamCreate(&s));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&s));
   }
   return s;
 }
 }  // namespace
 
-DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
+void DeepCopyHIP(void* dst, void const* src, size_t n) {
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
 }
 
-DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIPHostPinnedSpace, Kokkos::Experimental::HIP>::
-    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
-             const void* src, size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
-      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
-}
-
-DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
-                                                  instance,
-                                              void* dst, const void* src,
-                                              size_t n) {
-  HIP_SAFE_CALL(
+void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst,
+                      void const* src, size_t n) {
+  KOKKOS_IMPL_HIP_SAFE_CALL(
       hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
 }
 
 void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
   hipStream_t s = get_deep_copy_stream();
-  HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
-  HIP_SAFE_CALL(hipStreamSynchronize(s));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      "Kokkos::Impl::DeepCopyAsyncHIP: Post Deep Copy Fence on Deep-Copy "
+      "stream",
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          DeepCopyResourceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(s)); });
 }
 
 }  // namespace Impl
@@ -171,6 +101,7 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 
 KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() {
@@ -188,6 +119,7 @@ KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) {
 }
 
 }  // namespace Kokkos
+#endif
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
@@ -283,7 +215,7 @@ void HIPSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-  HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
 }
 
 void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr,
@@ -307,7 +239,7 @@ void HIPHostPinnedSpace::impl_deallocate(
     Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                       reported_size);
   }
-  HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
 }
 
 }  // namespace Experimental
@@ -427,23 +359,42 @@ HIP::HIP()
       "HIP instance constructor");
 }
 
-HIP::HIP(hipStream_t const stream)
+HIP::HIP(hipStream_t const stream, bool manage_stream)
     : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) {
         ptr->finalize();
         delete ptr;
       }) {
   Impl::HIPInternal::singleton().verify_is_initialized(
       "HIP instance constructor");
-  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream);
+  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream,
+                               manage_stream);
 }
 
 void HIP::print_configuration(std::ostream& s, const bool) {
   Impl::HIPInternal::singleton().print_configuration(s);
 }
 
-void HIP::impl_static_fence() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
+uint32_t HIP::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
+void HIP::impl_static_fence(const std::string& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::HIP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
+}
+void HIP::impl_static_fence() {
+  impl_static_fence("Kokkos::HIP::impl_static_fence: Unnamed Static Fence");
+}
 
-void HIP::fence() const { m_space_instance->fence(); }
+void HIP::fence(const std::string& name) const {
+  m_space_instance->fence(name);
+}
+void HIP::fence() const {
+  fence("Kokkos::HIP::fence(): Unnamed Instance Fence");
+}
 
 hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; }
 
@@ -489,6 +440,9 @@ void HIPSpaceInitializer::finalize(const bool all_spaces) {
 void HIPSpaceInitializer::fence() {
   Kokkos::Experimental::HIP::impl_static_fence();
 }
+void HIPSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::HIP::impl_static_fence(name);
+}
 
 void HIPSpaceInitializer::print_configuration(std::ostream& msg,
                                               const bool detail) {
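The `Kokkos_HIP_Space.cpp` hunks above collapse a dozen near-identical `DeepCopy` constructor bodies onto two free functions, `DeepCopyHIP` and `DeepCopyAsyncHIP`. The following is a minimal sketch of that consolidation pattern, using hypothetical stand-ins (`Stream`, `copy_blocking`, `copy_async`, `DeepCopySketch`) rather than the real HIP/Kokkos types.

```cpp
#include <cstddef>
#include <cstring>

struct Stream {};  // stand-in for hipStream_t

void copy_blocking(void* dst, const void* src, std::size_t n) {
  std::memcpy(dst, src, n);  // plays the role of hipMemcpy(..., hipMemcpyDefault)
}

void copy_async(const Stream&, void* dst, const void* src, std::size_t n) {
  std::memcpy(dst, src, n);  // plays the role of hipMemcpyAsync on the instance stream
}

// Every (DstSpace, SrcSpace) combination forwards to the same two entry
// points, so error checking and profiling hooks live in exactly one place.
template <class DstSpace, class SrcSpace>
struct DeepCopySketch {
  DeepCopySketch(void* dst, const void* src, std::size_t n) {
    copy_blocking(dst, src, n);
  }
  DeepCopySketch(const Stream& s, void* dst, const void* src, std::size_t n) {
    copy_async(s, dst, src, n);
  }
};

struct SpaceA {};
struct SpaceB {};

int main() {
  double a[4] = {1, 2, 3, 4}, b[4] = {};
  DeepCopySketch<SpaceA, SpaceB>(b, a, sizeof(a));            // blocking flavor
  DeepCopySketch<SpaceB, SpaceA>(Stream{}, b, a, sizeof(a));  // stream flavor
  return b[3] == 4 ? 0 : 1;
}
```

Funneling the copies through two entry points is also what lets the profiling fence hook (see `DeepCopyAsyncHIP` above) be added in a single place.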
diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
index fe52886ced7c7a72454f9e731b3b5b4778f90073..fb67a25c5e7f5e3b0a48118ffe14372f0b1cd2dc 100644
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
+++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp
@@ -316,198 +316,6 @@ class HIPTeamMember {
 #endif
   }
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
-                    void* const global_scratch_space, void* const shmem,
-                    int const shmem_size) {
-#ifdef __HIP_DEVICE_COMPILE__
-    using value_type   = typename ReducerType::value_type;
-    using pointer_type = value_type volatile*;
-
-    // Number of shared memory entries for the reduction:
-    const int nsh = shmem_size / sizeof(value_type);
-
-    // Number of HIP threads in the block, rank within the block
-    const int nid = blockDim.x * blockDim.y * blockDim.z;
-    const int tid =
-        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
-
-    // Reduces within block using all available shared memory
-    // Contributes if it is the root "vector lane"
-
-    // wn == number of warps in the block
-    // wx == which lane within the warp
-    // wy == which warp within the block
-
-    const int wn = (nid + Experimental::Impl::HIPTraits::WarpIndexMask) >>
-                   Experimental::Impl::HIPTraits::WarpIndexShift;
-    const int wx = tid & Experimental::Impl::HIPTraits::WarpIndexMask;
-    const int wy = tid >> Experimental::Impl::HIPTraits::WarpIndexShift;
-
-    //------------------------
-    {  // Intra warp shuffle reduction from contributing HIP threads
-
-      value_type tmp(reducer.reference());
-
-      int constexpr warp_size =
-          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
-      for (int i = warp_size; static_cast<int>(blockDim.x) <= (i >>= 1);) {
-        Experimental::Impl::in_place_shfl_down(reducer.reference(), tmp, i,
-                                               warp_size);
-
-        // Root of each vector lane reduces "thread" contribution
-        if (0 == threadIdx.x && wx < i) {
-          reducer.join(&tmp, reducer.data());
-        }
-      }
-
-      // Reduce across warps using shared memory.
-      // Number of warps may not be power of two.
-
-      __syncthreads();  // Wait before shared data write
-
-      // Number of shared memory entries for the reduction
-      // is at most one per warp
-      const int nentry = wn < nsh ? wn : nsh;
-
-      if (0 == wx && wy < nentry) {
-        // Root thread of warp 'wy' has warp's value to contribute
-        (reinterpret_cast<value_type*>(shmem))[wy] = tmp;
-      }
-
-      __syncthreads();  // Wait for write to be visible to block
-
-      // When more warps than shared entries
-      // then warps must take turns joining their contribution
-      // to the designated shared memory entry.
-      for (int i = nentry; i < wn; i += nentry) {
-        const int k = wy - i;
-
-        if (0 == wx && i <= wy && k < nentry) {
-          // Root thread of warp 'wy' has warp's value to contribute
-          reducer.join((reinterpret_cast<value_type*>(shmem)) + k, &tmp);
-        }
-
-        __syncthreads();  // Wait for write to be visible to block
-      }
-
-      // One warp performs the inter-warp reduction:
-
-      if (0 == wy) {
-        // Start fan-in at power of two covering nentry
-
-        for (int i = (1 << (warp_size - __clz(nentry - 1))); (i >>= 1);) {
-          const int k = wx + i;
-          if (wx < i && k < nentry) {
-            reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
-                         (reinterpret_cast<pointer_type>(shmem)) + k);
-            __threadfence_block();  // Wait for write to be visible to warp
-          }
-        }
-      }
-    }
-    //------------------------
-    {  // Write block's value to global_scratch_memory
-
-      int last_block = 0;
-
-      if (0 == wx) {
-        reducer.copy((reinterpret_cast<pointer_type>(global_scratch_space)) +
-                         blockIdx.x * reducer.length(),
-                     reducer.data());
-
-        __threadfence();  // Wait until global write is visible.
-
-        last_block = static_cast<int>(gridDim.x) ==
-                     1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);
-
-        // If last block then reset count
-        if (last_block) *global_scratch_flags = 0;
-      }
-
-      // FIXME hip does not support __syncthreads_or so we need to do it by hand
-      // last_block = __syncthreads_or(last_block);
-
-      __shared__ int last_block_shared;
-      if (last_block) last_block_shared = last_block;
-      __threadfence_block();
-
-      if (!last_block_shared) return 0;
-    }
-    //------------------------
-    // Last block reads global_scratch_memory into shared memory.
-
-    const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
-                                       : (gridDim.x < nsh ? gridDim.x : nsh);
-
-    // nentry = min( nid , nsh , gridDim.x )
-
-    // whole block reads global memory into shared memory:
-
-    if (tid < nentry) {
-      const int offset = tid * reducer.length();
-
-      reducer.copy(
-          (reinterpret_cast<pointer_type>(shmem)) + offset,
-          (reinterpret_cast<pointer_type>(global_scratch_space)) + offset);
-
-      for (int i = nentry + tid; i < static_cast<int>(gridDim.x); i += nentry) {
-        reducer.join((reinterpret_cast<pointer_type>(shmem)) + offset,
-                     (reinterpret_cast<pointer_type>(global_scratch_space)) +
-                         i * reducer.length());
-      }
-    }
-
-    __syncthreads();  // Wait for writes to be visible to block
-
-    if (0 == wy) {
-      // Iterate to reduce shared memory to single warp fan-in size
-
-      int constexpr warp_size =
-          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
-      const int nreduce = warp_size < nentry ? warp_size : nentry;
-
-      if (wx < nreduce && nreduce < nentry) {
-        for (int i = nreduce + wx; i < nentry; i += nreduce) {
-          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
-        }
-        __threadfence_block();  // Wait for writes to be visible to warp
-      }
-
-      // Start fan-in at power of two covering nentry
-
-      for (int i = (1 << (warp_size - __clz(nreduce - 1))); (i >>= 1);) {
-        const int k = wx + i;
-        if (wx < i && k < nreduce) {
-          reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
-                       (reinterpret_cast<pointer_type>(shmem)) + k);
-          __threadfence_block();  // Wait for writes to be visible to warp
-        }
-      }
-
-      if (0 == wx) {
-        reducer.copy(reducer.data(), reinterpret_cast<pointer_type>(shmem));
-        return 1;
-      }
-    }
-    return 0;
-#else
-    (void)reducer;
-    (void)global_scratch_flags;
-    (void)global_scratch_space;
-    (void)shmem;
-    (void)shmem_size;
-    return 0;
-#endif
-  }
-
   //----------------------------------------
   // Private for the driver
 
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
index 910d5e52e6ac62e290f4d7c918217aa518ec3ec7..d9cb66e11f4638c8635d0e9e33d7cbda67e1cada 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp
@@ -191,6 +191,9 @@ void HPXSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); }
+void HPXSpaceInitializer::fence(const std::string &name) {
+  Kokkos::Experimental::HPX().fence(name);
+}
 
 void HPXSpaceInitializer::print_configuration(std::ostream &msg,
                                               const bool detail) {
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
index df09e026fd9b45bc1c4f7d0c55e5ae10d336ad72..7bb3ca5d007023d99314c8d45de748da7836136d 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp
@@ -82,7 +82,9 @@ class TaskQueueSpecialization<
     task_queue.scheduler = &scheduler;
     Kokkos::Impl::dispatch_execute_task(&task_queue,
                                         Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence();
+    Kokkos::Experimental::HPX().fence(
+        "Kokkos::Impl::TaskQueueSpecialization<SimpleTask>::execute: fence "
+        "after task execution");
   }
 
   // Must provide task queue execution function
@@ -214,7 +216,9 @@ class TaskQueueSpecializationConstrained<
     task_queue.scheduler = &scheduler;
     Kokkos::Impl::dispatch_execute_task(&task_queue,
                                         Kokkos::Experimental::HPX());
-    Kokkos::Experimental::HPX().fence();
+    Kokkos::Experimental::HPX().fence(
+        "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: fence "
+        "after task execution");
   }
 
   // Must provide task queue execution function
diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
index 527fe12ad937f9b89029f12d5c64044f40671572..d7e13e28f054569926382933232b7119ca96a192 100644
--- a/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp
@@ -79,7 +79,9 @@ class ParallelFor<FunctorType, Kokkos::WorkGraphPolicy<Traits...>,
  public:
   void execute() const {
     dispatch_execute_task(this, m_policy.space());
-    m_policy.space().fence();
+    m_policy.space().fence(
+        "Kokkos::Experimental::Impl::HPX::ParallelFor<WorkGraphPolicy>: fence "
+        "after kernel execution");
   }
 
   void execute_task() const {
diff --git a/packages/kokkos/core/src/KokkosExp_InterOp.hpp b/packages/kokkos/core/src/KokkosExp_InterOp.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..37c2088f88f08758f5f1585b7138f43dd73d54eb
--- /dev/null
+++ b/packages/kokkos/core/src/KokkosExp_InterOp.hpp
@@ -0,0 +1,147 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_EXP_INTEROP_HPP
+#define KOKKOS_CORE_EXP_INTEROP_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_View.hpp>
+#include <impl/Kokkos_Utilities.hpp>
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+// ------------------------------------------------------------------ //
+//  this is used to convert
+//      Kokkos::Device<ExecSpace, MemSpace> to MemSpace
+//
+template <typename Tp>
+struct device_memory_space {
+  using type = Tp;
+};
+
+template <typename ExecT, typename MemT>
+struct device_memory_space<Kokkos::Device<ExecT, MemT>> {
+  using type = MemT;
+};
+
+template <typename Tp>
+using device_memory_space_t = typename device_memory_space<Tp>::type;
+
+// ------------------------------------------------------------------ //
+//  this is the impl version which takes a view and converts to python
+//  view type
+//
+template <typename, typename...>
+struct python_view_type_impl;
+
+template <template <typename...> class ViewT, typename ValueT,
+          typename... Types>
+struct python_view_type_impl<ViewT<ValueT>, type_list<Types...>> {
+  using type = ViewT<ValueT, device_memory_space_t<Types>...>;
+};
+
+template <template <typename...> class ViewT, typename ValueT,
+          typename... Types>
+struct python_view_type_impl<ViewT<ValueT, Types...>>
+    : python_view_type_impl<ViewT<ValueT>,
+                            filter_type_list_t<is_default_memory_trait,
+                                               type_list<Types...>, false>> {};
+
+template <typename... T>
+using python_view_type_impl_t = typename python_view_type_impl<T...>::type;
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+namespace Kokkos {
+
+template <typename DataType, class... Properties>
+class DynRankView;
+
+namespace Impl {
+
+// Duplicate from the header file for DynRankView to avoid core depending on
+// containers.
+template <class>
+struct is_dyn_rank_view_dup : public std::false_type {};
+
+template <class D, class... P>
+struct is_dyn_rank_view_dup<Kokkos::DynRankView<D, P...>>
+    : public std::true_type {};
+
+}  // namespace Impl
+
+namespace Experimental {
+
+// ------------------------------------------------------------------ //
+//  this is used to extract the uniform type of a view
+//
+template <typename ViewT>
+struct python_view_type {
+  static_assert(
+      Kokkos::is_view<std::decay_t<ViewT>>::value ||
+          Kokkos::Impl::is_dyn_rank_view_dup<std::decay_t<ViewT>>::value,
+      "Error! python_view_type only supports Kokkos::View and "
+      "Kokkos::DynRankView");
+
+  using type =
+      Kokkos::Impl::python_view_type_impl_t<typename ViewT::array_type>;
+};
+
+template <typename ViewT>
+using python_view_type_t = typename python_view_type<ViewT>::type;
+
+template <typename Tp>
+auto as_python_type(Tp&& _v) {
+  using cast_type = python_view_type_t<Tp>;
+  return static_cast<cast_type>(std::forward<Tp>(_v));
+}
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
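A hedged usage sketch for the new `KokkosExp_InterOp.hpp` header above. It assumes a host-only Kokkos build and that the header is reachable on the include path like the other core headers; the `pykokkos-base` mention is only an example of a possible consumer.

```cpp
#include <Kokkos_Core.hpp>
#include <KokkosExp_InterOp.hpp>
#include <type_traits>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using device_t = Kokkos::Device<Kokkos::DefaultHostExecutionSpace,
                                    Kokkos::HostSpace>;
    using view_t   = Kokkos::View<double**, device_t>;

    // The "uniform" type replaces Device<Exec, Mem> with the memory space,
    // which is what a python-facing layer (e.g. pykokkos-base) can spell.
    using py_t = Kokkos::Experimental::python_view_type_t<view_t>;
    static_assert(std::is_same<typename py_t::memory_space,
                               Kokkos::HostSpace>::value,
                  "uniform view type carries only the memory space");

    view_t v("v", 3, 4);
    py_t   pv = v;  // ordinary View conversion between compatible types
    (void)pv;
  }
  Kokkos::finalize();
  return 0;
}
```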
diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index b7d8e62f696073bfa4794b362401aaca288de021..dfae7451fc302362743c9349485ee574a15a2d76 100644
--- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -48,6 +48,7 @@
 #include <initializer_list>
 
 #include <Kokkos_Layout.hpp>
+#include <Kokkos_Rank.hpp>
 #include <Kokkos_Array.hpp>
 #include <impl/KokkosExp_Host_IterateTile.hpp>
 #include <Kokkos_ExecPolicy.hpp>
@@ -78,22 +79,6 @@ struct default_inner_direction {
   static constexpr Iterate value = Iterate::Right;
 };
 
-// Iteration Pattern
-template <unsigned N, Iterate OuterDir = Iterate::Default,
-          Iterate InnerDir = Iterate::Default>
-struct Rank {
-  static_assert(N != 0u, "Kokkos Error: rank 0 undefined");
-  static_assert(N != 1u,
-                "Kokkos Error: rank 1 is not a multi-dimensional range");
-  static_assert(N < 7u, "Kokkos Error: Unsupported rank...");
-
-  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
-
-  static constexpr int rank                = N;
-  static constexpr Iterate outer_direction = OuterDir;
-  static constexpr Iterate inner_direction = InnerDir;
-};
-
 namespace Impl {
 // NOTE the comparison below is encapsulated to silent warnings about pointless
 // comparison of unsigned integer with zero
@@ -397,13 +382,18 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility
 namespace Kokkos {
 namespace Experimental {
-using Kokkos::Iterate;
-using Kokkos::MDRangePolicy;
-using Kokkos::Rank;
+using Iterate KOKKOS_DEPRECATED = Kokkos::Iterate;
+template <typename... Properties>
+using MDRangePolicy KOKKOS_DEPRECATED = Kokkos::MDRangePolicy<Properties...>;
+template <unsigned N, Kokkos::Iterate OuterDir = Kokkos::Iterate::Default,
+          Kokkos::Iterate InnerDir = Kokkos::Iterate::Default>
+using Rank KOKKOS_DEPRECATED = Kokkos::Rank<N, OuterDir, InnerDir>;
 }  // namespace Experimental
 }  // namespace Kokkos
+#endif
 
 #endif  // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
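With `Rank` moved to `Kokkos_Rank.hpp` and the `Kokkos::Experimental` spellings reduced to deprecation aliases above, user code is expected to name these types directly in namespace `Kokkos`. A small usage sketch of that spelling, using standard Kokkos API with the default execution space:

```cpp
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    double sum = 0.0;
    // MDRangePolicy and Rank spelled in namespace Kokkos, not Experimental.
    Kokkos::parallel_reduce(
        "sum2d", Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {4, 5}),
        KOKKOS_LAMBDA(const int i, const int j, double& lsum) {
          lsum += i * 10 + j;
        },
        sum);
    std::printf("sum = %f\n", sum);
  }
  Kokkos::finalize();
  return 0;
}
```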
diff --git a/packages/kokkos/core/src/Kokkos_Atomic.hpp b/packages/kokkos/core/src/Kokkos_Atomic.hpp
index 8cd60fa6bae993895ac901fbbab8eb532a6a0ded..a47208e97782ec424a2a96b5ec4de0c58fe2fef2 100644
--- a/packages/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/packages/kokkos/core/src/Kokkos_Atomic.hpp
@@ -69,6 +69,60 @@
 #define KOKKOS_ATOMIC_HPP
 
 #include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+#define DESUL_HAVE_OPENMP_ATOMICS
+#endif
+#include <Kokkos_Atomics_Desul_Wrapper.hpp>
+#include <Kokkos_Atomics_Desul_Volatile_Wrapper.hpp>
+#include <impl/Kokkos_Utilities.hpp>
+
+// Helper functions for places where we really should have called SeqCst
+// atomics anyway. These can go away once we call desul unconditionally.
+// Non-Desul versions are below.
+namespace Kokkos {
+namespace Impl {
+using desul::MemoryOrderSeqCst;
+using desul::MemoryScopeDevice;
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return desul::atomic_dec(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
+                           desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return desul::atomic_inc(const_cast<T*>(dest), desul::MemoryOrderSeqCst(),
+                           desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T
+desul_atomic_exchange(T* dest, const Kokkos::Impl::identity_t<T> val,
+                      MemoryOrderSeqCst, MemoryScopeDevice) {
+  return desul::atomic_exchange(const_cast<T*>(dest), val,
+                                desul::MemoryOrderSeqCst(),
+                                desul::MemoryScopeDevice());
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
+    T* dest, Kokkos::Impl::identity_t<const T> compare,
+    Kokkos::Impl::identity_t<const T> val, MemoryOrderSeqCst,
+    MemoryScopeDevice) {
+  return desul::atomic_compare_exchange(dest, compare, val,
+                                        desul::MemoryOrderSeqCst(),
+                                        desul::MemoryScopeDevice());
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+#else
+
 #include <Kokkos_HostSpace.hpp>
 #include <impl/Kokkos_Traits.hpp>
 
@@ -326,4 +380,42 @@ inline const char* atomic_query_version() {
 
 //----------------------------------------------------------------------------
 
+// Helper functions for places where we really should have called SeqCst
+// atomics anyway. These can go away once we call desul unconditionally.
+namespace Kokkos {
+namespace Impl {
+struct MemoryOrderSeqCst {};
+struct MemoryScopeDevice {};
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_dec(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return Kokkos::atomic_decrement(dest);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION void desul_atomic_inc(T* dest, MemoryOrderSeqCst,
+                                             MemoryScopeDevice) {
+  return Kokkos::atomic_increment(dest);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T
+desul_atomic_exchange(T* dest, Kokkos::Impl::identity_t<const T> val,
+                      MemoryOrderSeqCst, MemoryScopeDevice) {
+  return Kokkos::atomic_exchange(dest, val);
+}
+
+template <class T>
+KOKKOS_INLINE_FUNCTION T desul_atomic_compare_exchange(
+    T* dest, Kokkos::Impl::identity_t<const T> compare,
+    Kokkos::Impl::identity_t<const T> val, MemoryOrderSeqCst,
+    MemoryScopeDevice) {
+  return Kokkos::atomic_compare_exchange(dest, compare, val);
+}
+
+}  // namespace Impl
+}  // namespace Kokkos
+
+#endif /* !KOKKOS_ENABLE_IMPL_DESUL_ATOMICS */
 #endif /* KOKKOS_ATOMIC_HPP */
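The `desul_atomic_*` shims added above are internal (`Kokkos::Impl`) helpers whose only purpose is to give call sites one spelling for sequentially consistent, device-scope atomics in both configurations. A hedged sketch of such a call site, relying only on the declarations introduced in this hunk:

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Atomic.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    int counter = 0;
    // Same spelling whether or not KOKKOS_ENABLE_IMPL_DESUL_ATOMICS is
    // defined; forwards either to desul or to the classic Kokkos::atomic_*.
    Kokkos::Impl::desul_atomic_inc(&counter, Kokkos::Impl::MemoryOrderSeqCst(),
                                   Kokkos::Impl::MemoryScopeDevice());
    int old = Kokkos::Impl::desul_atomic_compare_exchange(
        &counter, 1, 42, Kokkos::Impl::MemoryOrderSeqCst(),
        Kokkos::Impl::MemoryScopeDevice());
    (void)old;  // old == 1 and counter == 42 after the successful exchange
  }
  Kokkos::finalize();
  return 0;
}
```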
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0bcb3ea388beeaf72c862f9519572e5d9e13a530
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp
@@ -0,0 +1,189 @@
+#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
+#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics.hpp>
+
+// clang-format off
+namespace Kokkos { 
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast<T*>(dest), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// atomic_fetch_op
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_USE_DOUBLE_ATOMICADD
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_add(volatile double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(const_cast<double*>(dest),val);
+  #else
+  return desul::atomic_fetch_add (const_cast<double*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+};
+
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_sub(volatile double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(const_cast<double*>(dest),-val);
+  #else
+  return desul::atomic_fetch_sub (const_cast<double*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+};
+#endif
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or  (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op_fetch
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch  (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_lshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_and yet so call fetch_and
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_or yet so call fetch_or
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_or  (const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast<T*>(dest),desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// Exchange
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(const_cast<T*>(dest), val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) {
+  return desul::atomic_compare_exchange_strong(const_cast<T*>(dest),expected, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) {
+  return desul::atomic_compare_exchange(const_cast<T*>(dest),compare, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+}
+// clang-format on
+#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#endif
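
A minimal usage sketch of the volatile wrappers above (illustrative only, not part of the patch; the function name is hypothetical and it assumes KOKKOS_ENABLE_IMPL_DESUL_ATOMICS is defined and Kokkos is initialized):

#include <Kokkos_Core.hpp>

void volatile_wrapper_sketch() {
  volatile int counter = 0;
  // Forwards to desul::atomic_add(const_cast<int*>(&counter), 1, relaxed order, device scope).
  Kokkos::atomic_add(&counter, 1);
  int old = Kokkos::atomic_exchange(&counter, 5);  // returns the previous value (1)
  // The volatile overload takes 'expected' by non-const reference.
  bool swapped = Kokkos::atomic_compare_exchange_strong(&counter, old, 7);
  (void)swapped;
}
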
diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a182a6a22b56ca424d13e3d9f0835070f1cb2f6
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp
@@ -0,0 +1,271 @@
+#ifndef KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
+#define KOKKOS_DESUL_ATOMICS_WRAPPER_HPP_
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#include <desul/atomics.hpp>
+
+#include <impl/Kokkos_Atomic_Memory_Order.hpp>
+#include <impl/Kokkos_Volatile_Load.hpp>
+
+// clang-format off
+namespace Kokkos {
+
+// FIXME: These functions and macros are neither used nor tested in the unit tests ...
+// ==========================================================
+inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; }
+
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \
+    !defined(__CUDA_ARCH__)
+
+#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr, 0, 0)
+#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr, 1, 0)
+
+#else
+
+#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
+#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
+
+#endif
+// ============================================================
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { atomic_store(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+void memory_fence() {
+  desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), desul::MemoryScopeDevice());
+}
+
+KOKKOS_INLINE_FUNCTION
+void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), desul::MemoryScopeDevice()); }
+
+KOKKOS_INLINE_FUNCTION
+void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), desul::MemoryScopeDevice()); }
+
+// atomic_fetch_op
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_USE_DOUBLE_ATOMICADD
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_add(double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(dest,val);
+  #else
+  return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+}
+
+KOKKOS_INLINE_FUNCTION
+double atomic_fetch_sub(double* const dest, double val) {
+  #ifdef __CUDA_ARCH__
+  return atomicAdd(dest,-val);
+  #else
+  return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+  #endif
+}
+#endif
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or  (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_or  (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op_fetch
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch  (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_or_fetch  (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+
+// atomic_op
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_and yet so call fetch_and
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// FIXME: Desul doesn't have atomic_or yet so call fetch_or
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val)  { (void) desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+// Exchange
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice()); }
+
+template<class T> KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> expected, desul::Impl::dont_deduce_this_parameter_t<const T> desired) {
+  T expected_ref = expected;
+  return desul::atomic_compare_exchange_strong(dest, expected_ref, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+template<class T> KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t<const T> compare, desul::Impl::dont_deduce_this_parameter_t<const T> desired) {
+  return desul::atomic_compare_exchange(dest, compare, desired,
+                  desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
+}
+
+namespace Impl {
+
+  template<class MemoryOrder>
+  struct KokkosToDesulMemoryOrder;
+
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_seq_cst_t> {
+    using type = desul::MemoryOrderSeqCst;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_acquire_t> {
+    using type = desul::MemoryOrderAcquire;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_release_t> {
+    using type = desul::MemoryOrderRelease;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_acq_rel_t> {
+    using type = desul::MemoryOrderAcqRel;
+  };
+  template<>
+  struct KokkosToDesulMemoryOrder<memory_order_relaxed_t> {
+    using type = desul::MemoryOrderRelaxed;
+  };
+  template<class T, class MemOrderSuccess, class MemOrderFailure> KOKKOS_INLINE_FUNCTION
+  bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess, MemOrderFailure) {
+    return desul::atomic_compare_exchange_strong(dest, expected, desired,
+                  typename KokkosToDesulMemoryOrder<MemOrderSuccess>::type(),
+                  typename KokkosToDesulMemoryOrder<MemOrderFailure>::type(),
+                  desul::MemoryScopeDevice());
+  }
+  template<class T, class MemoryOrder>
+  KOKKOS_INLINE_FUNCTION
+  T atomic_load(const T* const src, MemoryOrder) {
+    return desul::atomic_load(src, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), desul::MemoryScopeDevice());
+  }
+  template<class T, class MemoryOrder>
+  KOKKOS_INLINE_FUNCTION
+  void atomic_store(T* const dest, const T val, MemoryOrder) {
+    return desul::atomic_store(dest, val, typename KokkosToDesulMemoryOrder<MemoryOrder>::type(), desul::MemoryScopeDevice());
+  }
+}
+
+}
+// clang-format on
+#endif  // KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+#endif
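
A note on the signatures above: desul::Impl::dont_deduce_this_parameter_t makes the value argument a non-deduced context, so T is deduced from the destination pointer alone and mixed-type calls convert rather than failing deduction. A short illustrative sketch (hypothetical function name, not part of the patch):

#include <Kokkos_Core.hpp>

void non_deduced_value_sketch() {
  double x = 0.0;
  // T is deduced as double from &x; the int literal converts to double instead
  // of creating a double-vs-int deduction conflict.
  Kokkos::atomic_add(&x, 1);
  double previous = Kokkos::atomic_fetch_add(&x, 2.5);  // previous == 1.0
  (void)previous;
}
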
diff --git a/packages/kokkos/core/src/Kokkos_Complex.hpp b/packages/kokkos/core/src/Kokkos_Complex.hpp
index 6578723fc8e5dab1e605b1a5dc80f1daf4b2ebfb..466903ab7d6626c0cd7ff97754594cc17933367e 100644
--- a/packages/kokkos/core/src/Kokkos_Complex.hpp
+++ b/packages/kokkos/core/src/Kokkos_Complex.hpp
@@ -77,7 +77,7 @@ class
 
   //! Default constructor (initializes both real and imaginary parts to zero).
   KOKKOS_DEFAULTED_FUNCTION
-  complex() noexcept = default;
+  complex() = default;
 
   //! Copy constructor.
   KOKKOS_DEFAULTED_FUNCTION
@@ -150,11 +150,11 @@ class
 
   //! The imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 RealType& imag() noexcept { return im_; }
+  constexpr RealType& imag() noexcept { return im_; }
 
   //! The real part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 RealType& real() noexcept { return re_; }
+  constexpr RealType& real() noexcept { return re_; }
 
   //! The imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
@@ -166,41 +166,39 @@ class
 
   //! Set the imaginary part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  void imag(RealType v) noexcept { im_ = v; }
+  constexpr void imag(RealType v) noexcept { im_ = v; }
 
   //! Set the real part of this complex number.
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  void real(RealType v) noexcept { re_ = v; }
+  constexpr void real(RealType v) noexcept { re_ = v; }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator+=(
       const complex<RealType>& src) noexcept {
     re_ += src.re_;
     im_ += src.im_;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator+=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator+=(
       const RealType& src) noexcept {
     re_ += src;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator-=(
       const complex<RealType>& src) noexcept {
     re_ -= src.re_;
     im_ -= src.im_;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator-=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator-=(
       const RealType& src) noexcept {
     re_ -= src;
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator*=(
       const complex<RealType>& src) noexcept {
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
@@ -209,7 +207,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator*=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator*=(
       const RealType& src) noexcept {
     re_ *= src;
     im_ *= src;
@@ -217,7 +215,7 @@ class
   }
 
   // Conditional noexcept, just in case RealType throws on divide-by-zero
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const complex<RealType>& y) noexcept(noexcept(RealType{} / RealType{})) {
     using Kokkos::Experimental::fabs;
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
@@ -244,8 +242,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14
-  KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const std::complex<RealType>& y) noexcept(noexcept(RealType{} /
                                                          RealType{})) {
     using Kokkos::Experimental::fabs;
@@ -272,7 +269,7 @@ class
     return *this;
   }
 
-  KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=(
+  constexpr KOKKOS_INLINE_FUNCTION complex& operator/=(
       const RealType& src) noexcept(noexcept(RealType{} / RealType{})) {
     re_ /= src;
     im_ /= src;
@@ -688,12 +685,24 @@ KOKKOS_INLINE_FUNCTION RealType imag(const complex<RealType>& x) noexcept {
   return x.imag();
 }
 
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t<ArithmeticType> imag(
+    ArithmeticType) {
+  return ArithmeticType();
+}
+
 //! Real part of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION RealType real(const complex<RealType>& x) noexcept {
   return x.real();
 }
 
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t<ArithmeticType> real(
+    ArithmeticType x) {
+  return x;
+}
+
 //! Constructs a complex number from magnitude and phase angle
 template <class T>
 KOKKOS_INLINE_FUNCTION complex<T> polar(const T& r, const T& theta = T()) {
@@ -733,36 +742,6 @@ KOKKOS_INLINE_FUNCTION complex<T> pow(const complex<T>& x,
   return x == T() ? T() : exp(y * log(x));
 }
 
-namespace Impl {
-// NOTE promote would also be useful for math functions
-template <class T, bool = std::is_integral<T>::value>
-struct promote {
-  using type = double;
-};
-template <class T>
-struct promote<T, false> {};
-template <>
-struct promote<long double> {
-  using type = long double;
-};
-template <>
-struct promote<double> {
-  using type = double;
-};
-template <>
-struct promote<float> {
-  using type = float;
-};
-template <class T>
-using promote_t = typename promote<T>::type;
-template <class T, class U>
-struct promote_2 {
-  using type = decltype(promote_t<T>() + promote_t<U>());
-};
-template <class T, class U>
-using promote_2_t = typename promote_2<T, U>::type;
-}  // namespace Impl
-
 template <class T, class U,
           class = std::enable_if_t<std::is_arithmetic<T>::value>>
 KOKKOS_INLINE_FUNCTION complex<Impl::promote_2_t<T, U>> pow(
@@ -816,6 +795,13 @@ KOKKOS_INLINE_FUNCTION complex<RealType> conj(
   return complex<RealType>(real(x), -imag(x));
 }
 
+template <class ArithmeticType>
+KOKKOS_INLINE_FUNCTION constexpr complex<Impl::promote_t<ArithmeticType>> conj(
+    ArithmeticType x) {
+  using type = Impl::promote_t<ArithmeticType>;
+  return complex<type>(x, -type());
+}
+
 //! Exponential of a complex number.
 template <class RealType>
 KOKKOS_INLINE_FUNCTION complex<RealType> exp(const complex<RealType>& x) {
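
The overloads added above extend real(), imag(), and conj() to plain arithmetic arguments, promoting integral types to double via Impl::promote_t (whose definition is removed from this header by the patch and is assumed to be provided elsewhere). A hedged sketch of the intended behavior (hypothetical function name):

#include <Kokkos_Core.hpp>
#include <type_traits>

void complex_arithmetic_overload_sketch() {
  // Integral arguments promote to double, mirroring std::real/std::imag.
  static_assert(std::is_same<decltype(Kokkos::imag(1)), double>::value,
                "imag(int) yields double");
  double r = Kokkos::real(3);     // 3.0, the value is passed through
  auto   c = Kokkos::conj(2.0f);  // Kokkos::complex<float>(2.0f, -0.0f)
  (void)r;
  (void)c;
}
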
diff --git a/packages/kokkos/core/src/Kokkos_Concepts.hpp b/packages/kokkos/core/src/Kokkos_Concepts.hpp
index 2aba189487490d4f870cec407ec1d1f3b9ed001e..97137387f264a869dacb6b3b5abdd7fa5a9ba5ff 100644
--- a/packages/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/packages/kokkos/core/src/Kokkos_Concepts.hpp
@@ -180,20 +180,23 @@ KOKKOS_IMPL_IS_CONCEPT(work_item_property)
 
 namespace Impl {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility:
 
-using Kokkos::is_array_layout;
-using Kokkos::is_execution_policy;
-using Kokkos::is_execution_space;
-using Kokkos::is_memory_space;
-using Kokkos::is_memory_traits;
+template <typename T>
+using is_array_layout KOKKOS_DEPRECATED = Kokkos::is_array_layout<T>;
+template <typename T>
+using is_execution_policy KOKKOS_DEPRECATED = Kokkos::is_execution_policy<T>;
+template <typename T>
+using is_execution_space KOKKOS_DEPRECATED = Kokkos::is_execution_space<T>;
+template <typename T>
+using is_memory_space KOKKOS_DEPRECATED = Kokkos::is_memory_space<T>;
+template <typename T>
+using is_memory_traits KOKKOS_DEPRECATED = Kokkos::is_memory_traits<T>;
+#endif
 
 // Implementation concept:
 
-KOKKOS_IMPL_IS_CONCEPT(iteration_pattern)
-KOKKOS_IMPL_IS_CONCEPT(schedule_type)
-KOKKOS_IMPL_IS_CONCEPT(index_type)
-KOKKOS_IMPL_IS_CONCEPT(launch_bounds)
 KOKKOS_IMPL_IS_CONCEPT(thread_team_member)
 KOKKOS_IMPL_IS_CONCEPT(host_thread_team_member)
 KOKKOS_IMPL_IS_CONCEPT(graph_kernel)
@@ -330,42 +333,65 @@ struct is_space {
   // For backward compatibility, deprecated in favor of
   // Kokkos::Impl::HostMirror<S>::host_mirror_space
 
-  using host_memory_space = typename std::conditional<
+ private:
+  // The actual definitions of host_memory_space and host_execution_space live
+  // in do_not_use_host_memory_space and do_not_use_host_execution_space so
+  // they can be used within this class without deprecation warnings.
+  using do_not_use_host_memory_space = std::conditional_t<
       std::is_same<memory_space, Kokkos::HostSpace>::value
 #if defined(KOKKOS_ENABLE_CUDA)
           || std::is_same<memory_space, Kokkos::CudaUVMSpace>::value ||
           std::is_same<memory_space, Kokkos::CudaHostPinnedSpace>::value
-#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
+#elif defined(KOKKOS_ENABLE_HIP)
+          || std::is_same<memory_space,
+                          Kokkos::Experimental::HIPHostPinnedSpace>::value
+#elif defined(KOKKOS_ENABLE_SYCL)
+          || std::is_same<memory_space,
+                          Kokkos::Experimental::SYCLSharedUSMSpace>::value ||
+          std::is_same<memory_space,
+                       Kokkos::Experimental::SYCLHostUSMSpace>::value
+#endif
       ,
-      memory_space, Kokkos::HostSpace>::type;
+      memory_space, Kokkos::HostSpace>;
 
+  using do_not_use_host_execution_space = std::conditional_t<
 #if defined(KOKKOS_ENABLE_CUDA)
-  using host_execution_space = typename std::conditional<
-      std::is_same<execution_space, Kokkos::Cuda>::value,
-      Kokkos::DefaultHostExecutionSpace, execution_space>::type;
-#else
-#if defined(KOKKOS_ENABLE_OPENMPTARGET)
-  using host_execution_space = typename std::conditional<
-      std::is_same<execution_space, Kokkos::Experimental::OpenMPTarget>::value,
-      Kokkos::DefaultHostExecutionSpace, execution_space>::type;
-#else
-  using host_execution_space = execution_space;
-#endif
+      std::is_same<execution_space, Kokkos::Cuda>::value ||
+#elif defined(KOKKOS_ENABLE_HIP)
+      std::is_same<execution_space, Kokkos::Experimental::HIP>::value ||
+#elif defined(KOKKOS_ENABLE_SYCL)
+      std::is_same<execution_space, Kokkos::Experimental::SYCL>::value ||
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
+      std::is_same<execution_space,
+                   Kokkos::Experimental::OpenMPTarget>::value ||
 #endif
+          false,
+      Kokkos::DefaultHostExecutionSpace, execution_space>;
 
-  using host_mirror_space = typename std::conditional<
-      std::is_same<execution_space, host_execution_space>::value &&
-          std::is_same<memory_space, host_memory_space>::value,
-      T, Kokkos::Device<host_execution_space, host_memory_space>>::type;
+ public:
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+  using host_memory_space KOKKOS_DEPRECATED = do_not_use_host_memory_space;
+  using host_execution_space KOKKOS_DEPRECATED =
+      do_not_use_host_execution_space;
+  using host_mirror_space KOKKOS_DEPRECATED = std::conditional_t<
+      std::is_same<execution_space, do_not_use_host_execution_space>::value &&
+          std::is_same<memory_space, do_not_use_host_memory_space>::value,
+      T,
+      Kokkos::Device<do_not_use_host_execution_space,
+                     do_not_use_host_memory_space>>;
+#endif
 };
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 // For backward compatibility
 
 namespace Impl {
 
-using Kokkos::is_space;
+template <typename T>
+using is_space KOKKOS_DEPRECATED = Kokkos::is_space<T>;
 
 }
+#endif
 
 }  // namespace Kokkos
 
@@ -485,13 +511,18 @@ struct SpaceAccessibility {
 
 }  // namespace Kokkos
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 namespace Impl {
 
-using Kokkos::SpaceAccessibility;  // For backward compatibility
+// For backward compatibility
+template <typename AccessSpace, typename MemorySpace>
+using SpaceAccessibility KOKKOS_DEPRECATED =
+    Kokkos::SpaceAccessibility<AccessSpace, MemorySpace>;
 
-}
+}  // namespace Impl
 }  // namespace Kokkos
+#endif
 
 //----------------------------------------------------------------------------
 
diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
index a27d5f0e47284f7d06b3d9218d1f02bfb679468e..16946dd602b536793f8746165ebd4bb82631742b 100644
--- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp
+++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp
@@ -47,6 +47,7 @@
 #include <string>
 #include <Kokkos_Parallel.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
+#include <Kokkos_Layout.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -544,13 +545,11 @@ void view_copy(const ExecutionSpace& space, const DstType& dst,
 
   enum {
     ExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<ExecutionSpace,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecutionSpace, src_memory_space>::accessible
   };
   enum {
     ExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<ExecutionSpace,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecutionSpace, dst_memory_space>::accessible
   };
 
   if (!(ExecCanAccessSrc && ExecCanAccessDst)) {
@@ -624,14 +623,14 @@ void view_copy(const DstType& dst, const SrcType& src) {
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) {
@@ -1254,6 +1253,98 @@ struct ViewRemap<DstType, SrcType, ExecSpace, 8> {
   }
 };
 
+template <typename ExecutionSpace, class DT, class... DP>
+inline void contiguous_fill(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType     = View<DT, DP...>;
+  using ViewTypeFlat = Kokkos::View<
+      typename ViewType::value_type*, Kokkos::LayoutRight,
+      Kokkos::Device<typename ViewType::execution_space,
+                     typename std::conditional<ViewType::Rank == 0,
+                                               typename ViewType::memory_space,
+                                               Kokkos::AnonymousSpace>::type>,
+      Kokkos::MemoryTraits<0>>;
+
+  ViewTypeFlat dst_flat(dst.data(), dst.size());
+  if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) {
+    Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
+                           ViewTypeFlat::Rank, int>(dst_flat, value,
+                                                    exec_space);
+  } else
+    Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, ExecutionSpace,
+                           ViewTypeFlat::Rank, int64_t>(dst_flat, value,
+                                                        exec_space);
+}
+
+template <typename ExecutionSpace, class DT, class... DP>
+struct ZeroMemset {
+  ZeroMemset(const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+             typename ViewTraits<DT, DP...>::const_value_type& value) {
+    contiguous_fill(exec_space, dst, value);
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename ViewTraits<DT, DP...>::const_value_type& value) {
+    contiguous_fill(ExecutionSpace(), dst, value);
+  }
+};
+
+template <typename ExecutionSpace, class DT, class... DP>
+inline std::enable_if_t<
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value>
+contiguous_fill_or_memset(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  if (Impl::is_zero_byte(value))
+    ZeroMemset<ExecutionSpace, DT, DP...>(exec_space, dst, value);
+  else
+    contiguous_fill(exec_space, dst, value);
+}
+
+template <typename ExecutionSpace, class DT, class... DP>
+inline std::enable_if_t<!(
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+contiguous_fill_or_memset(
+    const ExecutionSpace& exec_space, const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  contiguous_fill(exec_space, dst, value);
+}
+
+template <class DT, class... DP>
+inline std::enable_if_t<
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value>
+contiguous_fill_or_memset(
+    const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType        = View<DT, DP...>;
+  using exec_space_type = typename ViewType::execution_space;
+
+  if (Impl::is_zero_byte(value))
+    ZeroMemset<exec_space_type, DT, DP...>(dst, value);
+  else
+    contiguous_fill(exec_space_type(), dst, value);
+}
+
+template <class DT, class... DP>
+inline std::enable_if_t<!(
+    std::is_trivial<typename ViewTraits<DT, DP...>::const_value_type>::value &&
+    std::is_trivially_copy_assignable<
+        typename ViewTraits<DT, DP...>::const_value_type>::value)>
+contiguous_fill_or_memset(
+    const View<DT, DP...>& dst,
+    typename ViewTraits<DT, DP...>::const_value_type& value) {
+  using ViewType        = View<DT, DP...>;
+  using exec_space_type = typename ViewType::execution_space;
+
+  contiguous_fill(exec_space_type(), dst, value);
+}
 }  // namespace Impl
 
 /** \brief  Deep copy a value from Host memory into a view.  */
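
The helpers above choose between a byte-wise zero fill (ZeroMemset, when the value type is trivial and trivially copy assignable and the fill value is all zero bytes) and the generic element-wise fill. A standalone, host-only sketch of that dispatch idea (hypothetical helper, not the Kokkos implementation):

#include <cstring>
#include <type_traits>

// If the value type is trivially copyable and the fill value's object
// representation is all zeros, a memset produces the same result as assigning
// the value element by element; otherwise fall back to the assignment loop.
template <class T>
void fill_or_memset_sketch(T* data, std::size_t n, const T& value) {
  unsigned char bytes[sizeof(T)];
  std::memcpy(bytes, &value, sizeof(T));
  bool all_zero = true;
  for (std::size_t i = 0; i < sizeof(T); ++i) all_zero = all_zero && (bytes[i] == 0);

  if (std::is_trivially_copyable<T>::value && all_zero) {
    std::memset(data, 0, n * sizeof(T));                  // ZeroMemset-like fast path
  } else {
    for (std::size_t i = 0; i < n; ++i) data[i] = value;  // contiguous_fill-like path
  }
}
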
@@ -1276,38 +1367,23 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: scalar copy, fence because destination is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
     return;
   }
 
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence");
   static_assert(std::is_same<typename ViewType::non_const_value_type,
                              typename ViewType::value_type>::value,
                 "deep_copy requires non-const type");
 
-  // If contiguous we can simply do a 1D flat loop
+  // If contiguous we can simply do a 1D flat loop or use memset
   if (dst.span_is_contiguous()) {
-    using ViewTypeFlat = Kokkos::View<
-        typename ViewType::value_type*, Kokkos::LayoutRight,
-        Kokkos::Device<typename ViewType::execution_space,
-                       typename std::conditional<
-                           ViewType::Rank == 0, typename ViewType::memory_space,
-                           Kokkos::AnonymousSpace>::type>,
-        Kokkos::MemoryTraits<0>>;
-
-    ViewTypeFlat dst_flat(dst.data(), dst.size());
-    if (dst.span() < static_cast<size_t>(std::numeric_limits<int>::max())) {
-      Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type,
-                             ViewTypeFlat::Rank, int>(dst_flat, value,
-                                                      exec_space_type());
-    } else
-      Kokkos::Impl::ViewFill<ViewTypeFlat, Kokkos::LayoutRight, exec_space_type,
-                             ViewTypeFlat::Rank, int64_t>(dst_flat, value,
-                                                          exec_space_type());
-    Kokkos::fence();
+    Impl::contiguous_fill_or_memset(dst, value);
+    Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1362,7 +1438,7 @@ inline void deep_copy(
                              exec_space_type, ViewType::Rank, int>(
           dst, value, exec_space_type());
   }
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar copy, post copy fence");
 
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -1393,7 +1469,7 @@ inline void deep_copy(
   }
 
   if (src.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence("Kokkos::deep_copy: copy into scalar, src is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1439,18 +1515,19 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr && src.data() == nullptr) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: scalar to scalar copy, both pointers null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
     return;
   }
 
-  Kokkos::fence();
+  Kokkos::fence("Kokkos::deep_copy: scalar to scalar copy, pre copy fence");
   if (dst.data() != src.data()) {
     Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
         dst.data(), src.data(), sizeof(value_type));
-    Kokkos::fence();
+    Kokkos::fence("Kokkos::deep_copy: scalar to scalar copy, post copy fence");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -1522,7 +1599,9 @@ inline void deep_copy(
 
       Kokkos::Impl::throw_runtime_exception(message);
     }
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, fence due to null "
+        "argument");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1531,14 +1610,14 @@ inline void deep_copy(
 
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   // Checking for Overlapping Views.
@@ -1549,7 +1628,9 @@ inline void deep_copy(
   if (((std::ptrdiff_t)dst_start == (std::ptrdiff_t)src_start) &&
       ((std::ptrdiff_t)dst_end == (std::ptrdiff_t)src_end) &&
       (dst.span_is_contiguous() && src.span_is_contiguous())) {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, fence due to same "
+        "spans");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -1620,16 +1701,22 @@ inline void deep_copy(
       ((dst_type::rank < 7) || (dst.stride_6() == src.stride_6())) &&
       ((dst_type::rank < 8) || (dst.stride_7() == src.stride_7()))) {
     const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, pre view equality "
+        "check");
     if ((void*)dst.data() != (void*)src.data()) {
       Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
           dst.data(), src.data(), nbytes);
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::deep_copy: copy between contiguous views, post deep copy "
+          "fence");
     }
   } else {
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, pre copy fence");
     Impl::view_copy(dst, src);
-    Kokkos::fence();
+    Kokkos::fence(
+        "Kokkos::deep_copy: copy between contiguous views, post copy fence");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -2418,9 +2505,9 @@ inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        Kokkos::Impl::SpaceAccessibility<
+        Kokkos::SpaceAccessibility<
             ExecSpace,
             typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
         nullptr) {
@@ -2437,7 +2524,9 @@ inline void deep_copy(
         "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
   }
   if (dst.data() == nullptr) {
-    space.fence();
+    space.fence("Kokkos::deep_copy: scalar copy on space, dst data is null");
+  } else if (dst.span_is_contiguous()) {
+    Impl::contiguous_fill_or_memset(space, dst, value);
   } else {
     using ViewTypeUniform = typename std::conditional<
         View<DT, DP...>::Rank == 0,
@@ -2458,9 +2547,9 @@ inline void deep_copy(
     const ExecSpace& space, const View<DT, DP...>& dst,
     typename ViewTraits<DT, DP...>::const_value_type& value,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
-        !Kokkos::Impl::SpaceAccessibility<
+        !Kokkos::SpaceAccessibility<
             ExecSpace,
             typename ViewTraits<DT, DP...>::memory_space>::accessible>::type* =
         nullptr) {
@@ -2477,17 +2566,23 @@ inline void deep_copy(
         "(none)", &value, dst.span() * sizeof(typename dst_traits::value_type));
   }
   if (dst.data() == nullptr) {
-    space.fence();
+    space.fence(
+        "Kokkos::deep_copy: scalar-to-view copy on space, dst data is null");
   } else {
-    space.fence();
-    using ViewTypeUniform = typename std::conditional<
-        View<DT, DP...>::Rank == 0,
-        typename View<DT, DP...>::uniform_runtime_type,
-        typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
+    space.fence("Kokkos::deep_copy: scalar-to-view copy on space, pre copy");
     using fill_exec_space = typename dst_traits::memory_space::execution_space;
-    Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
-                           fill_exec_space>(dst, value, fill_exec_space());
-    fill_exec_space().fence();
+    if (dst.span_is_contiguous()) {
+      Impl::contiguous_fill_or_memset(fill_exec_space(), dst, value);
+    } else {
+      using ViewTypeUniform = typename std::conditional<
+          View<DT, DP...>::Rank == 0,
+          typename View<DT, DP...>::uniform_runtime_type,
+          typename View<DT, DP...>::uniform_runtime_nomemspace_type>::type;
+      Kokkos::Impl::ViewFill<ViewTypeUniform, typename dst_traits::array_layout,
+                             fill_exec_space>(dst, value, fill_exec_space());
+    }
+    fill_exec_space().fence(
+        "Kokkos::deep_copy: scalar-to-view copy on space, fence after fill");
   }
   if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
     Kokkos::Profiling::endDeepCopy();
@@ -2501,7 +2596,7 @@ inline void deep_copy(
     typename ViewTraits<ST, SP...>::non_const_value_type& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize,
                      void>::value>::type* = nullptr) {
   using src_traits       = ViewTraits<ST, SP...>;
@@ -2517,7 +2612,8 @@ inline void deep_copy(
   }
 
   if (src.data() == nullptr) {
-    exec_space.fence();
+    exec_space.fence(
+        "Kokkos::deep_copy: view-to-scalar copy on space, src data is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -2538,7 +2634,7 @@ inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<(
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
         (unsigned(ViewTraits<DT, DP...>::rank) == unsigned(0) &&
@@ -2562,7 +2658,8 @@ inline void deep_copy(
   }
 
   if (dst.data() == nullptr && src.data() == nullptr) {
-    exec_space.fence();
+    exec_space.fence(
+        "Kokkos::deep_copy: view-to-view copy on space, data is null");
     if (Kokkos::Tools::Experimental::get_callbacks().end_deep_copy != nullptr) {
       Kokkos::Profiling::endDeepCopy();
     }
@@ -2588,7 +2685,7 @@ inline void deep_copy(
     const ExecSpace& exec_space, const View<DT, DP...>& dst,
     const View<ST, SP...>& src,
     typename std::enable_if<(
-        Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+        Kokkos::is_execution_space<ExecSpace>::value &&
         std::is_same<typename ViewTraits<DT, DP...>::specialize, void>::value &&
         std::is_same<typename ViewTraits<ST, SP...>::specialize, void>::value &&
         (unsigned(ViewTraits<DT, DP...>::rank) != 0 ||
@@ -2662,21 +2759,19 @@ inline void deep_copy(
 
   enum {
     ExecCanAccessSrcDst =
-        Kokkos::Impl::SpaceAccessibility<ExecSpace,
-                                         dst_memory_space>::accessible &&
-        Kokkos::Impl::SpaceAccessibility<ExecSpace,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<ExecSpace, dst_memory_space>::accessible &&
+        Kokkos::SpaceAccessibility<ExecSpace, src_memory_space>::accessible
   };
   enum {
     DstExecCanAccessSrc =
-        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
-                                         src_memory_space>::accessible
+        Kokkos::SpaceAccessibility<dst_execution_space,
+                                   src_memory_space>::accessible
   };
 
   enum {
     SrcExecCanAccessDst =
-        Kokkos::Impl::SpaceAccessibility<src_execution_space,
-                                         dst_memory_space>::accessible
+        Kokkos::SpaceAccessibility<src_execution_space,
+                                   dst_memory_space>::accessible
   };
 
   // Error out for non-identical overlapping views.
@@ -2757,9 +2852,13 @@ inline void deep_copy(
       using cpy_exec_space =
           typename std::conditional<DstExecCanAccessSrc, dst_execution_space,
                                     src_execution_space>::type;
-      exec_space.fence();
+      exec_space.fence(
+          "Kokkos::deep_copy: view-to-view noncontiguous copy on space, pre "
+          "copy");
       Impl::view_copy(cpy_exec_space(), dst, src);
-      cpy_exec_space().fence();
+      cpy_exec_space().fence(
+          "Kokkos::deep_copy: view-to-view noncontiguous copy on space, post "
+          "copy");
     } else {
       Kokkos::Impl::throw_runtime_exception(
           "deep_copy given views that would require a temporary allocation");
@@ -2777,6 +2876,19 @@ inline void deep_copy(
 
 namespace Kokkos {
 
+namespace Impl {
+template <typename ViewType>
+bool size_mismatch(const ViewType& view, unsigned int max_extent,
+                   const size_t new_extents[8]) {
+  for (unsigned int dim = 0; dim < max_extent; ++dim)
+    if (new_extents[dim] != view.extent(dim)) {
+      return true;
+    }
+  return false;
+}
+
+}  // namespace Impl
+
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
 template <class T, class... P>
@@ -2798,67 +2910,6 @@ resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
 
-  // Fix #904 by checking dimensions before actually resizing.
-  //
-  // Rank is known at compile time, so hopefully the compiler will
-  // remove branches that are compile-time false.  The upcoming "if
-  // constexpr" language feature would make this certain.
-  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
-    return;
-  }
-  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1))) {
-    return;
-  }
-  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2))) {
-    return;
-  }
-  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3))) {
-    return;
-  }
-  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4))) {
-    return;
-  }
-  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5))) {
-    return;
-  }
-  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6))) {
-    return;
-  }
-  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6)) &&
-      n7 == static_cast<size_t>(v.extent(7))) {
-    return;
-  }
-  // If Kokkos ever supports Views of rank > 8, the above code won't
-  // be incorrect, because avoiding reallocation in resize() is just
-  // an optimization.
-
   // TODO (mfh 27 Jun 2017) If the old View has enough space but just
   // different dimensions (e.g., if the product of the dimensions,
   // including extra space for alignment, will not change), then
@@ -2866,11 +2917,17 @@ resize(Kokkos::View<T, P...>& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
   // reallocates if any of the dimensions change, even if the old View
   // has enough space.
 
-  view_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+  if (sizeMismatch) {
+    view_type v_resized(v.label(), n0, n1, n2, n3, n4, n5, n6, n7);
 
-  v = v_resized;
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
 }
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
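
With Impl::size_mismatch in place, resize() compares the requested dynamic extents against the current ones and only reallocates, copies, and fences when something actually changed. Usage sketch (illustrative, hypothetical function name):

#include <Kokkos_Core.hpp>

void resize_noop_sketch() {
  Kokkos::View<double**> a("a", 100, 50);
  double* before = a.data();
  Kokkos::resize(a, 100, 50);  // extents unchanged: no reallocation, a.data() == before
  Kokkos::resize(a, 200, 50);  // extent changed: reallocate, copy old data, fence
  (void)before;
}
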
@@ -2895,67 +2952,6 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only resize managed views");
 
-  // Fix #904 by checking dimensions before actually resizing.
-  //
-  // Rank is known at compile time, so hopefully the compiler will
-  // remove branches that are compile-time false.  The upcoming "if
-  // constexpr" language feature would make this certain.
-  if (view_type::Rank == 1 && n0 == static_cast<size_t>(v.extent(0))) {
-    return;
-  }
-  if (view_type::Rank == 2 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1))) {
-    return;
-  }
-  if (view_type::Rank == 3 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2))) {
-    return;
-  }
-  if (view_type::Rank == 4 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3))) {
-    return;
-  }
-  if (view_type::Rank == 5 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4))) {
-    return;
-  }
-  if (view_type::Rank == 6 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5))) {
-    return;
-  }
-  if (view_type::Rank == 7 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6))) {
-    return;
-  }
-  if (view_type::Rank == 8 && n0 == static_cast<size_t>(v.extent(0)) &&
-      n1 == static_cast<size_t>(v.extent(1)) &&
-      n2 == static_cast<size_t>(v.extent(2)) &&
-      n3 == static_cast<size_t>(v.extent(3)) &&
-      n4 == static_cast<size_t>(v.extent(4)) &&
-      n5 == static_cast<size_t>(v.extent(5)) &&
-      n6 == static_cast<size_t>(v.extent(6)) &&
-      n7 == static_cast<size_t>(v.extent(7))) {
-    return;
-  }
-  // If Kokkos ever supports Views of rank > 8, the above code won't
-  // be incorrect, because avoiding reallocation in resize() is just
-  // an optimization.
-
   // TODO (mfh 27 Jun 2017) If the old View has enough space but just
   // different dimensions (e.g., if the product of the dimensions,
   // including extra space for alignment, will not change), then
@@ -2963,19 +2959,64 @@ resize(const I& arg_prop, Kokkos::View<T, P...>& v,
   // reallocates if any of the dimensions change, even if the old View
   // has enough space.
 
-  view_type v_resized(view_alloc(v.label(), std::forward<const I>(arg_prop)),
-                      n0, n1, n2, n3, n4, n5, n6, n7);
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+  if (sizeMismatch) {
+    view_type v_resized(view_alloc(v.label(), std::forward<const I>(arg_prop)),
+                        n0, n1, n2, n3, n4, n5, n6, n7);
 
-  v = v_resized;
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    // This fence really ought to look for an execution space in
+    // arg_prop, and just fence that if there is one
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
 }
 
 /** \brief  Resize a view with copying old data to new data at the corresponding
  * indices. */
 template <class T, class... P>
-inline void resize(Kokkos::View<T, P...>& v,
-                   const typename Kokkos::View<T, P...>::array_layout& layout) {
+inline std::enable_if_t<
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutLeft>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutRight>::value ||
+    std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                 Kokkos::LayoutStride>::value ||
+    is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value>
+resize(Kokkos::View<T, P...>& v,
+       const typename Kokkos::View<T, P...>::array_layout& layout) {
+  using view_type = Kokkos::View<T, P...>;
+
+  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
+                "Can only resize managed views");
+
+  if (v.layout() != layout) {
+    view_type v_resized(v.label(), layout);
+
+    Kokkos::Impl::ViewRemap<view_type, view_type>(v_resized, v);
+    Kokkos::fence("Kokkos::resize(View)");
+
+    v = v_resized;
+  }
+}
+
+// FIXME User-provided (custom) layouts are not required to have a comparison
+// operator. Hence, there is no way to check if the requested layout is actually
+// the same as the existing one.
+template <class T, class... P>
+inline std::enable_if_t<
+    !(std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutLeft>::value ||
+      std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutRight>::value ||
+      std::is_same<typename Kokkos::View<T, P...>::array_layout,
+                   Kokkos::LayoutStride>::value ||
+      is_layouttiled<typename Kokkos::View<T, P...>::array_layout>::value)>
+resize(Kokkos::View<T, P...>& v,
+       const typename Kokkos::View<T, P...>::array_layout& layout) {
   using view_type = Kokkos::View<T, P...>;
 
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
@@ -3009,10 +3050,16 @@ realloc(Kokkos::View<T, P...>& v,
   static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                 "Can only realloc managed views");
 
-  const std::string label = v.label();
+  const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
+  const bool sizeMismatch = Impl::size_mismatch(v, v.rank_dynamic, new_extents);
 
-  v = view_type();  // Deallocate first, if the only view to allocation
-  v = view_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
+  if (sizeMismatch) {
+    const std::string label = v.label();
+
+    v = view_type();  // Deallocate first, if the only view to allocation
+    v = view_type(label, n0, n1, n2, n3, n4, n5, n6, n7);
+  } else
+    Kokkos::deep_copy(v, typename view_type::value_type{});
 }
 
 /** \brief  Resize a view with discarding old data. */
@@ -3209,7 +3256,8 @@ create_mirror_view_and_copy(
         Impl::MirrorViewType<Space, T, P...>::is_same_memspace>::type* =
         nullptr) {
   (void)name;
-  fence();  // same behavior as deep_copy(src, src)
+  fence(
+      "Kokkos::create_mirror_view_and_copy: fence before returning src view");  // same behavior as deep_copy(src, src)
   return src;
 }
 
diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp
index c3771ab393f3aaf8f77cb474056d90e867ff03da..60e748589df593dbb9e549f6433daea77b5bc6b0 100644
--- a/packages/kokkos/core/src/Kokkos_Core.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core.hpp
@@ -59,6 +59,7 @@
 #include <Kokkos_LogicalSpaces.hpp>
 #include <Kokkos_Pair.hpp>
 #include <Kokkos_MathematicalFunctions.hpp>
+#include <Kokkos_MathematicalSpecialFunctions.hpp>
 #include <Kokkos_MemoryPool.hpp>
 #include <Kokkos_Array.hpp>
 #include <Kokkos_View.hpp>
@@ -74,6 +75,7 @@
 #include <iosfwd>
 #include <map>
 #include <memory>
+#include <vector>
 
 //----------------------------------------------------------------------------
 
@@ -121,6 +123,7 @@ class ExecSpaceManager {
   void initialize_spaces(const Kokkos::InitArguments& args);
   void finalize_spaces(const bool all_spaces);
   void static_fence();
+  void static_fence(const std::string&);
   void print_configuration(std::ostream& msg, const bool detail);
   static ExecSpaceManager& get_instance();
 };
@@ -184,6 +187,7 @@ void push_finalize_hook(std::function<void()> f);
 void finalize_all();
 
 void fence();
+void fence(const std::string&);
 
 /** \brief Print "Bill of Materials" */
 void print_configuration(std::ostream&, const bool detail = false);
@@ -274,6 +278,44 @@ class ScopeGuard {
 
 }  // namespace Kokkos
 
+namespace Kokkos {
+namespace Experimental {
+// Partitioning an Execution Space: expects space and integer arguments for
+// relative weight
+//   Customization point for backends
+//   Default behavior is to return the passed in instance
+template <class ExecSpace, class... Args>
+std::vector<ExecSpace> partition_space(ExecSpace space, Args...) {
+  static_assert(is_execution_space<ExecSpace>::value,
+                "Kokkos Error: partition_space expects an Execution Space as "
+                "first argument");
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+  std::vector<ExecSpace> instances(sizeof...(Args));
+  for (int s = 0; s < int(sizeof...(Args)); s++) instances[s] = space;
+  return instances;
+}
+
+template <class ExecSpace, class T>
+std::vector<ExecSpace> partition_space(ExecSpace space,
+                                       std::vector<T>& weights) {
+  static_assert(is_execution_space<ExecSpace>::value,
+                "Kokkos Error: partition_space expects an Execution Space as "
+                "first argument");
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<ExecSpace> instances(weights.size());
+  for (int s = 0; s < int(weights.size()); s++) instances[s] = space;
+  return instances;
+}
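+
+// Minimal usage sketch (illustrative only, not part of this patch), assuming
+// the default behavior above that simply copies the passed-in instance:
+//
+//   auto two = Kokkos::Experimental::partition_space(
+//       Kokkos::DefaultExecutionSpace{}, 1, 1);        // two instances
+//   std::vector<int> weights{2, 1, 1};
+//   auto three = Kokkos::Experimental::partition_space(
+//       Kokkos::DefaultExecutionSpace{}, weights);     // three instances
+//
+// Backends may specialize this customization point to return genuinely
+// distinct execution space instances instead of copies.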
+}  // namespace Experimental
+}  // namespace Kokkos
+
 #include <Kokkos_Crs.hpp>
 #include <Kokkos_WorkGraphPolicy.hpp>
 // Including this in Kokkos_Parallel_Reduce.hpp led to a circular dependency
diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
index fe7eba3f6ef178848d2ea832341014d6dc5d1003..a610ee76dffb6fd23fadae40437b893eaab5cc87 100644
--- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -53,7 +53,9 @@
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Utilities.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 #include <Kokkos_MasterLock.hpp>
+#endif
 
 //----------------------------------------------------------------------------
 // Have assumed a 64bit build (8byte pointers) throughout the code base.
@@ -238,7 +240,8 @@ class LogicalMemorySpace;
 
 namespace Kokkos {
 void fence();
-}
+void fence(const std::string &);
+}  // namespace Kokkos
 
 //----------------------------------------------------------------------------
 
@@ -250,9 +253,13 @@ class View;
 namespace Impl {
 
 template <class DstSpace, class SrcSpace,
-          class ExecutionSpace = typename DstSpace::execution_space>
+          class ExecutionSpace = typename DstSpace::execution_space,
+          class Enable         = void>
 struct DeepCopy;
 
+template <typename ExecutionSpace, class DT, class... DP>
+struct ZeroMemset;
+
 template <class ViewType, class Layout = typename ViewType::array_layout,
           class ExecSpace = typename ViewType::execution_space,
           int Rank = ViewType::Rank, typename iType = int64_t>
diff --git a/packages/kokkos/core/src/Kokkos_Crs.hpp b/packages/kokkos/core/src/Kokkos_Crs.hpp
index 1a10500b19a55f4f963807dd2cf1a28e6062f98c..897402d37643bf8876360b3e828c685c6251fe19 100644
--- a/packages/kokkos/core/src/Kokkos_Crs.hpp
+++ b/packages/kokkos/core/src/Kokkos_Crs.hpp
@@ -179,7 +179,9 @@ class GetCrsTransposeCounts {
     const closure_type closure(*this,
                                policy_type(0, index_type(in.entries.size())));
     closure.execute();
-    execution_space().fence();
+    execution_space().fence(
+        "Kokkos::Impl::GetCrsTransposeCounts::GetCrsTransposeCounts: fence "
+        "after functor execution");
   }
 };
 
@@ -261,7 +263,9 @@ class FillCrsTransposeEntries {
     using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
     const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
     closure.execute();
-    execution_space().fence();
+    execution_space().fence(
+        "Kokkos::Impl::FillCrsTransposeEntries::FillCrsTransposeEntries: fence "
+        "after functor execution");
   }
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Kokkos_Cuda.hpp
index 7a218120bb7bb3b053335946ae25ad58c8a85e6d..c5a6b0f7d7d579e2ccad3c05f97d042d4ed63471 100644
--- a/packages/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/packages/kokkos/core/src/Kokkos_Cuda.hpp
@@ -55,13 +55,13 @@
 
 #include <impl/Kokkos_AnalyzePolicy.hpp>
 #include <Kokkos_CudaSpace.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>  // CUDA_SAFE_CALL
 
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 #include <impl/Kokkos_HostSharedPtr.hpp>
 
@@ -184,8 +184,10 @@ class Cuda {
   /// method does not return until all dispatched functors on this
   /// device have completed.
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
@@ -199,7 +201,7 @@ class Cuda {
 
   Cuda();
 
-  Cuda(cudaStream_t stream);
+  Cuda(cudaStream_t stream, bool manage_stream = false);
 
   //--------------------------------------------------------------------------
   //! \name Device-specific functions
@@ -246,7 +248,7 @@ class Cuda {
   inline Impl::CudaInternal* impl_internal_space_instance() const {
     return m_space_instance.get();
   }
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Kokkos::Impl::HostSharedPtr<Impl::CudaInternal> m_space_instance;
@@ -271,9 +273,28 @@ class CudaSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool all_spaces) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Cuda, DT, DP...> {
+  ZeroMemset(const Kokkos::Cuda& exec_space_instance,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemsetAsync(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type),
+        exec_space_instance.cuda_stream()));
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMemset(dst.data(), 0,
+                   dst.size() * sizeof(typename View<DT, DP...>::value_type)));
+  }
+};
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
index e10fae93c7ca01ce90f31b5d22ca9bff7d113884..910a8b2d7470b65b66d397a2507eb96723be447c 100644
--- a/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -70,6 +70,12 @@ extern "C" void kokkos_impl_cuda_set_pin_uvm_to_host(bool);
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+namespace Impl {
+
+template <typename T>
+struct is_cuda_type_space : public std::false_type {};
+
+}  // namespace Impl
 
 /** \brief  Cuda on-device memory management */
 
@@ -119,10 +125,12 @@ class CudaSpace {
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return m_name; }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
   KOKKOS_DEPRECATED static void access_error();
   KOKKOS_DEPRECATED static void access_error(const void* const);
+#endif
 
  private:
   int m_device;  ///< Which Cuda device
@@ -130,6 +138,10 @@ class CudaSpace {
   static constexpr const char* m_name = "Cuda";
   friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
 };
+
+template <>
+struct Impl::is_cuda_type_space<CudaSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -151,9 +163,11 @@ class CudaUVMSpace {
   /** \brief  If UVM capability is available */
   static bool available();
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  CudaUVMSpace specific routine */
   KOKKOS_DEPRECATED static int number_of_allocations();
+#endif
 
   /*--------------------------------*/
 
@@ -209,6 +223,9 @@ class CudaUVMSpace {
   static constexpr const char* m_name = "CudaUVM";
 };
 
+template <>
+struct Impl::is_cuda_type_space<CudaUVMSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -271,6 +288,9 @@ class CudaHostPinnedSpace {
   /*--------------------------------*/
 };
 
+template <>
+struct Impl::is_cuda_type_space<CudaHostPinnedSpace> : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -411,338 +431,107 @@ struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace> {
 namespace Kokkos {
 namespace Impl {
 
+void DeepCopyCuda(void* dst, const void* src, size_t n);
+void DeepCopyAsyncCuda(const Cuda& instance, void* dst, const void* src,
+                       size_t n);
 void DeepCopyAsyncCuda(void* dst, const void* src, size_t n);
 
-template <>
-struct DeepCopy<CudaSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<CudaSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Cuda&, void* dst, const void* src, size_t);
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, HostSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<HostSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaUVMSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Cuda,
+                std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
+                                 is_cuda_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyCuda(dst, src, n); }
   DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
+    DeepCopyAsyncCuda(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<CudaSpace, CudaHostPinnedSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <>
-struct DeepCopy<CudaHostPinnedSpace, CudaSpace, Cuda> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-  DeepCopy(const Cuda& instance, void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(instance, dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaSpace, ExecutionSpace> {
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<MemSpace1, MemSpace2, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
+                                 is_cuda_type_space<MemSpace2>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
-};
 
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaUVMSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<CudaSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
-  }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaUVMSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, CudaHostPinnedSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<MemSpace, HostSpace, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<CudaHostPinnedSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
-  }
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaUVMSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
+                std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
+                                 !std::is_same<ExecutionSpace, Cuda>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, CudaSpace, Cuda>(dst, src, n);
+    DeepCopyCuda(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncCuda(dst, src, n);
   }
-};
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, CudaHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, HostSpace, Cuda>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncCuda(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e060b343eb77ffd9783b2449def926704032334
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_DetectionIdiom.hpp
@@ -0,0 +1,116 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_DETECTION_IDIOM_HPP
+#define KOKKOS_DETECTION_IDIOM_HPP
+
+#include <impl/Kokkos_Utilities.hpp>  // void_t
+#include <type_traits>
+
+// NOTE This header implements the detection idiom from Version 2 of the C++
+// Extensions for Library Fundamentals, ISO/IEC TS 19568:2017
+
+// I deliberately omitted detected_or which does not fit well with the rest
+// of the specification. In my opinion, it should be removed from the TS.
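+//
+// Brief illustrative sketch (not part of this patch) of typical use of the
+// detection machinery defined below; `has_size_t` is a hypothetical archetype:
+//
+//   template <class T>
+//   using has_size_t = decltype(std::declval<T>().size());
+//
+//   static_assert(Kokkos::is_detected<has_size_t, std::vector<int>>::value,
+//                 "std::vector<int> has a size() member");
+//   static_assert(!Kokkos::is_detected<has_size_t, int>::value,
+//                 "int has no size() member");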
+
+namespace Kokkos {
+
+namespace Impl {
+// base class for nonesuch to inherit from so it is not an aggregate
+struct nonesuch_base {};
+
+// primary template handles all types not supporting the archetypal Op
+template <class Default, class /*AlwaysVoid*/, template <class...> class Op,
+          class... /*Args*/>
+struct detector {
+  using value_t = std::false_type;
+  using type    = Default;
+};
+
+// specialization recognizes and handles only types supporting Op
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
+  using value_t = std::true_type;
+  using type    = Op<Args...>;
+};
+}  // namespace Impl
+
+struct nonesuch : private Impl::nonesuch_base {
+  ~nonesuch()               = delete;
+  nonesuch(nonesuch const&) = delete;
+  void operator=(nonesuch const&) = delete;
+};
+
+template <template <class...> class Op, class... Args>
+using is_detected =
+    typename Impl::detector<nonesuch, void, Op, Args...>::value_t;
+
+template <template <class...> class Op, class... Args>
+using detected_t = typename Impl::detector<nonesuch, void, Op, Args...>::type;
+
+template <class Default, template <class...> class Op, class... Args>
+using detected_or_t = typename Impl::detector<Default, void, Op, Args...>::type;
+
+template <class Expected, template <class...> class Op, class... Args>
+using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;
+
+template <class To, template <class...> class Op, class... Args>
+using is_detected_convertible =
+    std::is_convertible<detected_t<Op, Args...>, To>;
+
+#ifdef KOKKOS_ENABLE_CXX17
+template <template <class...> class Op, class... Args>
+inline constexpr bool is_detected_v = is_detected<Op, Args...>::value;
+
+template <class Expected, template <class...> class Op, class... Args>
+inline constexpr bool is_detected_exact_v =
+    is_detected_exact<Expected, Op, Args...>::value;
+
+template <class Expected, template <class...> class Op, class... Args>
+inline constexpr bool is_detected_convertible_v =
+    is_detected_convertible<Expected, Op, Args...>::value;
+#endif
+
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 55aed13670e69838d94fff2735d421cc49a11835..c88c1ada14e38bd4c3cf90c61fc7351cc27fc8ea 100644
--- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -48,7 +48,6 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_AnalyzePolicy.hpp>
 #include <Kokkos_Concepts.hpp>
 #include <typeinfo>
diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
index d0366b599cf8c80c92812e386ced90f6fa77eb93..f6cdb2ec46cd4b5329a987e947ec356ff4efb0e9 100644
--- a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -287,7 +287,10 @@ struct DeepCopy<Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace,
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, "
+        "Kokkos::Experimental::HBWSpace,ExecutionSpace::DeepCopy: fence before "
+        "copy");
     memcpy(dst, src, n);
   }
 };
@@ -297,7 +300,9 @@ struct DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, Kokkos::Experimental::HBWSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     memcpy(dst, src, n);
   }
 };
@@ -307,7 +312,9 @@ struct DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<Kokkos::Experimental::HBWSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     memcpy(dst, src, n);
   }
 };
diff --git a/packages/kokkos/core/src/Kokkos_HIP.hpp b/packages/kokkos/core/src/Kokkos_HIP.hpp
index 33cf8321c80282d5346c66afb5ee9b4be589576b..09df4f2fed4d8c5499ec339391de0730474b1f80 100644
--- a/packages/kokkos/core/src/Kokkos_HIP.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP.hpp
@@ -54,7 +54,6 @@
 
 #include <Kokkos_HIP_Space.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <HIP/Kokkos_HIP_Instance.hpp>
 #include <HIP/Kokkos_HIP_MDRangePolicy.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
index 17bd681aa4b7b7aa8d98bb8253c86db81de6ce05..d20d533645b2f6bfc721820cb7a43adf4434f8e8 100644
--- a/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_HIP_Space.hpp
@@ -58,6 +58,7 @@
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
+#include <HIP/Kokkos_HIP_Error.hpp>  // HIP_SAFE_CALL
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
@@ -67,6 +68,13 @@
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+namespace Impl {
+
+template <typename T>
+struct is_hip_type_space : public std::false_type {};
+
+}  // namespace Impl
+
 namespace Experimental {
 /** \brief  HIP on-device memory management */
 
@@ -116,10 +124,12 @@ class HIPSpace {
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name() { return "HIP"; }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access HIPSpace */
   KOKKOS_DEPRECATED static void access_error();
   KOKKOS_DEPRECATED static void access_error(const void* const);
+#endif
 
  private:
   int m_device;  ///< Which HIP device
@@ -129,6 +139,11 @@ class HIPSpace {
 };
 
 }  // namespace Experimental
+
+template <>
+struct Impl::is_hip_type_space<Experimental::HIPSpace> : public std::true_type {
+};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -188,6 +203,11 @@ class HIPHostPinnedSpace {
   /*--------------------------------*/
 };
 }  // namespace Experimental
+
+template <>
+struct Impl::is_hip_type_space<Experimental::HIPHostPinnedSpace>
+    : public std::true_type {};
+
 }  // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -268,174 +288,116 @@ struct MemorySpaceAccess<Kokkos::Experimental::HIPHostPinnedSpace,
 namespace Kokkos {
 namespace Impl {
 
+void DeepCopyHIP(void* dst, const void* src, size_t n);
+void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst,
+                      const void* src, size_t n);
 void DeepCopyAsyncHIP(void* dst, const void* src, size_t n);
 
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
-                ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>(
-        dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
   }
+};
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::HIP,
+                std::enable_if_t<is_hip_type_space<MemSpace1>::value &&
+                                 is_hip_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopyHIP(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncHIP(instance, dst, src, n);
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, ExecutionSpace> {
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace1, MemSpace2, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace1>::value &&
+        is_hip_type_space<MemSpace2>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIP> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::HIP&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPSpace, ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace, HostSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
-};
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                Kokkos::Experimental::HIPHostPinnedSpace, ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
-
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                ExecutionSpace> {
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    HostSpace, MemSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_hip_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::HIP>::value>> {
   inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
+    DeepCopyHIP(dst, src, n);
   }
 
   inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
                   size_t n) {
-    exec.fence();
+    exec.fence(fence_string());
     DeepCopyAsyncHIP(dst, src, n);
   }
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                ExecutionSpace> {
-  inline DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
-                   Kokkos::Experimental::HIP>(dst, src, n);
-  }
 
-  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
-                  size_t n) {
-    exec.fence();
-    DeepCopyAsyncHIP(dst, src, n);
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
   }
 };
 }  // namespace Impl
@@ -536,7 +498,7 @@ class HIP {
   using scratch_memory_space = ScratchMemorySpace<HIP>;
 
   HIP();
-  HIP(hipStream_t stream);
+  HIP(hipStream_t stream, bool manage_stream = false);
 
   //@}
   //------------------------------------
@@ -558,8 +520,10 @@ class HIP {
    * until all dispatched functors on this device have completed.
    */
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   hipStream_t hip_stream() const;
 
@@ -596,7 +560,7 @@ class HIP {
     return m_space_instance.get();
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Kokkos::Impl::HostSharedPtr<Impl::HIPInternal> m_space_instance;
@@ -620,9 +584,28 @@ class HIPSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Experimental::HIP, DT, DP...> {
+  ZeroMemset(const Kokkos::Experimental::HIP& exec_space,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type),
+        exec_space.hip_stream()));
+  }
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    KOKKOS_IMPL_HIP_SAFE_CALL(
+        hipMemset(dst.data(), 0,
+                  dst.size() * sizeof(typename View<DT, DP...>::value_type)));
+  }
+};
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_HPX.hpp b/packages/kokkos/core/src/Kokkos_HPX.hpp
index 2100b49c116cfaecd35205aa60708ed1535578ca..236211864ee8f00f2bce884dc6a16666174bdadf 100644
--- a/packages/kokkos/core/src/Kokkos_HPX.hpp
+++ b/packages/kokkos/core/src/Kokkos_HPX.hpp
@@ -69,7 +69,6 @@
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_Tools.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
@@ -318,25 +317,50 @@ class HPX {
   }
 
   void impl_fence_instance() const {
-    if (hpx::threads::get_self_ptr() == nullptr) {
-      hpx::threads::run_as_hpx_thread([this]() { impl_get_future().wait(); });
-    } else {
-      impl_get_future().wait();
-    }
+    impl_fence_instance(
+        "Kokkos::Experimental::HPX::impl_fence_instance: Unnamed Instance "
+        "Fence");
+  }
+  void impl_fence_instance(const std::string &name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event(name, *this, [&]() {
+      if (hpx::threads::get_self_ptr() == nullptr) {
+        hpx::threads::run_as_hpx_thread([this]() { impl_get_future().wait(); });
+      } else {
+        impl_get_future().wait();
+      }
+    });
   }
 
   void impl_fence_all_instances() const {
-    hpx::util::yield_while(
-        []() { return m_active_parallel_region_count.load() != 0; });
+    impl_fence_instance(
+        "Kokkos::Experimental::HPX::impl_fence_all_instances: Unnamed Global "
+        "HPX Fence");
+  }
+  void impl_fence_all_instances(const std::string &name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event(name, *this, [&]() {
+      hpx::util::yield_while(
+          []() { return m_active_parallel_region_count.load() != 0; });
+    });
   }
 #endif
 
   void fence() const {
 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
     if (m_mode == instance_mode::global) {
-      impl_fence_all_instances();
+      impl_fence_all_instances(
+          "Kokkos::Experimental::HPX::fence: Unnamed Global HPX Fence");
+    } else {
+      impl_fence_instance(
+          "Kokkos::Experimental::HPX::fence: Unnamed HPX Instance Fence");
+    }
+#endif
+  }
+  void fence(const std::string &name) const {
+#if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
+    if (m_mode == instance_mode::global) {
+      impl_fence_all_instances(name);
     } else {
-      impl_fence_instance();
+      impl_fence_instance(name);
     }
 #endif
   }
@@ -464,6 +488,7 @@ class HPXSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments &args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string &) final;
   void print_configuration(std::ostream &msg, const bool detail) final;
 };
 
@@ -491,7 +516,9 @@ inline void dispatch_execute_task(Closure *closure,
   }
 
   if (force_synchronous) {
-    instance.fence();
+    instance.fence(
+        "Kokkos::Experimental::Impl::HPX::dispatch_execute_task: fence due to "
+        "forced syncronizations");
   }
 }
 #else
diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
index ba69fbad393ee391eff2b59c34d4ae526fa7af29..c96cf5fbbe1b3a07f75da83a2557d2bbe4cb38c0 100644
--- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -299,6 +299,20 @@ namespace Kokkos {
 
 namespace Impl {
 
+template <class DT, class... DP>
+struct ZeroMemset<typename HostSpace::execution_space, DT, DP...> {
+  ZeroMemset(const typename HostSpace::execution_space&,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type& value)
+      : ZeroMemset(dst, value) {}
+
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    using ValueType = typename View<DT, DP...>::value_type;
+    std::memset(dst.data(), 0, sizeof(ValueType) * dst.size());
+  }
+};
+
 template <class ExecutionSpace>
 struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> {
   DeepCopy(void* dst, const void* src, size_t n) {
@@ -306,9 +320,13 @@ struct DeepCopy<HostSpace, HostSpace, ExecutionSpace> {
   }
 
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence before copy");
     hostspace_parallel_deepcopy(dst, src, n);
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, HostSpace, "
+        "ExecutionSpace>::DeepCopy: fence after copy");
   }
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_Layout.hpp b/packages/kokkos/core/src/Kokkos_Layout.hpp
index 778b4f08109a5b2d617c2ff89298c9e92dbccb61..cfd77ea50fedcb5766ace9feb488c4c0f6238e89 100644
--- a/packages/kokkos/core/src/Kokkos_Layout.hpp
+++ b/packages/kokkos/core/src/Kokkos_Layout.hpp
@@ -50,7 +50,6 @@
 
 #include <cstddef>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 namespace Kokkos {
 
@@ -89,6 +88,16 @@ struct LayoutLeft {
                                 size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
                                 size_t N6 = 0, size_t N7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
+
+  friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutLeft& left, const LayoutLeft& right) {
+    return !(left == right);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -123,6 +132,16 @@ struct LayoutRight {
                                  size_t N3 = 0, size_t N4 = 0, size_t N5 = 0,
                                  size_t N6 = 0, size_t N7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {}
+
+  friend bool operator==(const LayoutRight& left, const LayoutRight& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutRight& left, const LayoutRight& right) {
+    return !(left == right);
+  }
 };
 
 //----------------------------------------------------------------------------
@@ -184,6 +203,18 @@ struct LayoutStride {
                                   size_t S7 = 0)
       : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3,
                                                           S4, S5, S6, S7} {}
+
+  friend bool operator==(const LayoutStride& left, const LayoutStride& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank] ||
+          left.stride[rank] != right.stride[rank])
+        return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutStride& left, const LayoutStride& right) {
+    return !(left == right);
+  }
 };
 
 // ===================================================================================
@@ -229,18 +260,6 @@ struct LayoutTiled {
   static_assert(IsPowerOfTwo,
                 "LayoutTiled must be given power-of-two tile dimensions");
 
-#if 0
-  static_assert( (Impl::is_integral_power_of_two(ArgN0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN1) ) &&
-                 (Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
-                 (Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
-               , "LayoutTiled must be given power-of-two tile dimensions" );
-#endif
-
   using array_layout = LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3,
                                    ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo>;
   static constexpr Iterate outer_pattern = OuterP;
@@ -270,6 +289,16 @@ struct LayoutTiled {
                                  size_t argN4 = 0, size_t argN5 = 0,
                                  size_t argN6 = 0, size_t argN7 = 0)
       : dimension{argN0, argN1, argN2, argN3, argN4, argN5, argN6, argN7} {}
+
+  friend bool operator==(const LayoutTiled& left, const LayoutTiled& right) {
+    for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank)
+      if (left.dimension[rank] != right.dimension[rank]) return false;
+    return true;
+  }
+
+  friend bool operator!=(const LayoutTiled& left, const LayoutTiled& right) {
+    return !(left == right);
+  }
 };
 
 }  // namespace Experimental
diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp
index 0d0185346540bf929b4305d6ad496b2f02e39c69..8d0fd925a27070dcd97160ba25ba19f06d3842b2 100644
--- a/packages/kokkos/core/src/Kokkos_Macros.hpp
+++ b/packages/kokkos/core/src/Kokkos_Macros.hpp
@@ -53,11 +53,12 @@
  *  KOKKOS_ENABLE_HPX                 Kokkos::Experimental::HPX execution space
  *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP execution space
  *  KOKKOS_ENABLE_OPENMPTARGET        Kokkos::Experimental::OpenMPTarget
- * execution space KOKKOS_ENABLE_HWLOC               HWLOC library is available.
+ *                                    execution space
+ *  KOKKOS_ENABLE_HIP                 Kokkos::Experimental::HIP execution space
+ *  KOKKOS_ENABLE_SYCL                Kokkos::Experimental::SYCL execution space
+ *  KOKKOS_ENABLE_HWLOC               HWLOC library is available.
  *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK  Insert array bounds checks, is expensive!
- *  KOKKOS_ENABLE_MPI                 Negotiate MPI/execution space
- * interactions. KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory
- * space.
+ *  KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory space.
  */
 
 #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
@@ -211,6 +212,11 @@
 #define KOKKOS_ENABLE_PRAGMA_SIMD 1
 #endif
 
+// FIXME Workaround for ICEs with Intel 17, 18, and 19 in Trilinos
+#if (KOKKOS_COMPILER_INTEL <= 1900)
+#define KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+#endif
+
 // FIXME_SYCL
 #if !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
@@ -220,11 +226,19 @@
 #define KOKKOS_MEMORY_ALIGNMENT 64
 #endif
 
+#if defined(_WIN32)
+#define KOKKOS_RESTRICT __restrict
+#else
 #define KOKKOS_RESTRICT __restrict__
+#endif
 
 #ifndef KOKKOS_IMPL_ALIGN_PTR
+#if defined(_WIN32)
+#define KOKKOS_IMPL_ALIGN_PTR(size) __declspec(align_value(size))
+#else
 #define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
 #endif
+#endif
 
 #if (1700 > KOKKOS_COMPILER_INTEL)
 #error "Compiling with Intel version earlier than 17.0 is not supported."
@@ -507,24 +521,44 @@
 #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
 #define KOKKOS_ENABLE_TASKDAG
 #endif
+// FIXME_SYCL Tasks not implemented
 #elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_TASKDAG
 #endif
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND
-#if (__CUDA_ARCH__)
-#define KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-#endif
-#endif
-
 #define KOKKOS_INVALID_INDEX (~std::size_t(0))
 
 #define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 #define KOKKOS_CONSTEXPR_14 constexpr
-#define KOKKOS_DEPRECATED [[deprecated]]
 #define KOKKOS_DEPRECATED_TRAILING_ATTRIBUTE
+#endif
+
+// Guard against Intel compiler versions <= 1900 (19.0), which emit
+// error #2651 (attribute does not apply to any entity) on
+// using <deprecated_type> KOKKOS_DEPRECATED = ...
+#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && !defined(__NVCC__) && \
+    (KOKKOS_COMPILER_INTEL > 1900)
+#define KOKKOS_DEPRECATED [[deprecated]]
+#define KOKKOS_DEPRECATED_WITH_COMMENT(comment) [[deprecated(comment)]]
+#else
+#define KOKKOS_DEPRECATED
+#define KOKKOS_DEPRECATED_WITH_COMMENT(comment)
+#endif
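+// Illustrative effect: a declaration such as
+//   KOKKOS_DEPRECATED void foo();
+// carries [[deprecated]] only when deprecation warnings are enabled and the
+// attribute is usable with the active compiler; otherwise the macro expands
+// to nothing.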
+
+#define KOKKOS_IMPL_STRINGIFY(x) #x
+#define KOKKOS_IMPL_TOSTRING(x) KOKKOS_IMPL_STRINGIFY(x)
+
+#ifdef _MSC_VER
+#define KOKKOS_IMPL_DO_PRAGMA(x) __pragma(x)
+#define KOKKOS_IMPL_WARNING(desc) \
+  KOKKOS_IMPL_DO_PRAGMA(message(  \
+      __FILE__ "(" KOKKOS_IMPL_TOSTRING(__LINE__) ") : warning: " #desc))
+#else
+#define KOKKOS_IMPL_DO_PRAGMA(x) _Pragma(#x)
+#define KOKKOS_IMPL_WARNING(desc) KOKKOS_IMPL_DO_PRAGMA(message(#desc))
+#endif
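+// Illustrative use: KOKKOS_IMPL_WARNING(Use the new interface) emits a
+// compile-time message; the MSVC branch prefixes it with __FILE__ and __LINE__
+// so it surfaces like an ordinary warning in build logs.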
 
 // DJS 05/28/2019: Bugfix: Issue 2155
 // Use KOKKOS_ENABLE_CUDA_LDG_INTRINSIC to avoid memory leak in RandomAccess
@@ -541,7 +575,7 @@
 
 #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) ||  \
      defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \
-    !defined(KOKKOS_COMPILER_MSVC)
+    !defined(_WIN32)
 #define KOKKOS_IMPL_ENABLE_STACKTRACE
 #define KOKKOS_IMPL_ENABLE_CXXABI
 #endif
@@ -553,7 +587,8 @@
 #undef __CUDA_ARCH__
 #endif
 
-#if defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG)
+#if (defined(KOKKOS_COMPILER_MSVC) && !defined(KOKKOS_COMPILER_CLANG)) || \
+    (defined(KOKKOS_COMPILER_INTEL) && defined(_WIN32))
 #define KOKKOS_THREAD_LOCAL __declspec(thread)
 #else
 #define KOKKOS_THREAD_LOCAL __thread
diff --git a/packages/kokkos/core/src/Kokkos_MasterLock.hpp b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
index 3c45e131a0fba6e39f3f97ef2fd67451b9aef76c..cbfbb92660ba9d75f4aadb67196d969902524a4f 100644
--- a/packages/kokkos/core/src/Kokkos_MasterLock.hpp
+++ b/packages/kokkos/core/src/Kokkos_MasterLock.hpp
@@ -47,6 +47,8 @@
 
 #include <Kokkos_Macros.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -72,4 +74,6 @@ class MasterLock;
 }  // namespace Experimental
 }  // namespace Kokkos
 
+#endif
+
 #endif  // KOKKOS_MASTER_LOCK_HPP
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
index 50223651e7d189e07cd94f9bf48eb6c5dcaa62d2..50fde82d77a7c37dfa0d5f3d1a565df470f680e0 100644
--- a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
+++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp
@@ -55,116 +55,224 @@
 #endif
 
 namespace Kokkos {
+
+namespace Impl {
+template <class T, bool = std::is_integral<T>::value>
+struct promote {
+  using type = double;
+};
+template <class T>
+struct promote<T, false> {};
+template <>
+struct promote<long double> {
+  using type = long double;
+};
+template <>
+struct promote<double> {
+  using type = double;
+};
+template <>
+struct promote<float> {
+  using type = float;
+};
+template <class T>
+using promote_t = typename promote<T>::type;
+template <class T, class U>
+struct promote_2 {
+  using type = decltype(promote_t<T>() + promote_t<U>());
+};
+template <class T, class U>
+using promote_2_t = typename promote_2<T, U>::type;
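+// Illustrative (not exercised here): promote_2_t<int, float> is double, since
+// integral arguments promote to double before the usual arithmetic
+// conversions, while promote_2_t<float, float> stays float.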
+}  // namespace Impl
+
 namespace Experimental {
 
 #if defined(KOKKOS_ENABLE_SYCL)
-#define NAMESPACE_MATH_FUNCTIONS sycl
+#define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE sycl
 #else
-#define NAMESPACE_MATH_FUNCTIONS std
+#define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE std
 #endif
 
-#define KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, RETURNTYPE, ARGTYPE) \
-  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(ARGTYPE x) {                        \
-    using NAMESPACE_MATH_FUNCTIONS::FUNC;                                    \
-    return FUNC(x);                                                          \
+// NOTE long double overloads are not available on the device
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#else
+#define KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+#endif
+
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                                 \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x) {                                \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION long double FUNC(long double x) {                    \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x) {                             \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION long double FUNC##l(long double x) {                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  template <class T>                                                          \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, double> \
+  FUNC(T x) {                                                                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(static_cast<double>(x));                                      \
   }
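+// For a given FUNC (e.g. sqrt), the macro above provides float, double, and
+// long double overloads, the FUNCf/FUNCl spellings, and an overload accepting
+// any integral type that computes in double.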
 
-#define KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, RETURNTYPE)              \
-  template <typename Integer,                                              \
-            typename = std::enable_if_t<std::is_integral<Integer>::value>> \
-  KOKKOS_INLINE_FUNCTION RETURNTYPE FUNC(Integer x) {                      \
-    return Kokkos::Experimental::FUNC(static_cast<double>(x));             \
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                              \
+  KOKKOS_INLINE_FUNCTION bool FUNC(float x) {                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(long double x) {                         \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  template <class T>                                                        \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, bool> \
+  FUNC(T x) {                                                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(static_cast<double>(x));                                    \
   }
 
-#define KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, TYPE) \
-  KOKKOS_INLINE_FUNCTION TYPE FUNC(TYPE x, TYPE y) {           \
-    using NAMESPACE_MATH_FUNCTIONS::FUNC;                      \
-    return FUNC(x, y);                                         \
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                               \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x, float y) {                      \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x, double y) {                   \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION long double FUNC(long double x, long double y) {    \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float y) {                   \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  KOKKOS_INLINE_FUNCTION long double FUNC##l(long double x, long double y) { \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(x, y);                                                       \
+  }                                                                          \
+  template <class T1, class T2>                                              \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_arithmetic<T1>::value &&   \
+                                              std::is_arithmetic<T2>::value, \
+                                          Kokkos::Impl::promote_2_t<T1, T2>> \
+  FUNC(T1 x, T2 y) {                                                         \
+    using Promoted = Kokkos::Impl::promote_2_t<T1, T2>;                      \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                        \
+    return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y));         \
   }
 
-// NOTE long double overloads are not available on the device
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
-    defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+#else  // long double overloads are not available
 
-#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
-  template <typename Arithmetic1, typename Arithmetic2,                      \
-            typename = std::enable_if_t<                                     \
-                std::is_arithmetic<Arithmetic1>::value &&                    \
-                std::is_arithmetic<Arithmetic2>::value &&                    \
-                !std::is_same<Arithmetic1, long double>::value &&            \
-                !std::is_same<Arithmetic2, long double>::value>>             \
-  KOKKOS_INLINE_FUNCTION double FUNC(Arithmetic1 x, Arithmetic2 y) {         \
-    return Kokkos::Experimental::FUNC(                                       \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
-                                       double, Arithmetic1>>(x),             \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
-                                       double, Arithmetic2>>(y));            \
+#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                                 \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x) {                                \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x) {                             \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(x);                                                           \
+  }                                                                           \
+  template <class T>                                                          \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, double> \
+  FUNC(T x) {                                                                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                         \
+    return FUNC(static_cast<double>(x));                                      \
   }
 
-#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                     \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)   \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
-
-#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                  \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)  \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
-
-#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)             \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)  \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double) \
-  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
-
-#define KOKKOS_IMPL_MATH_NAN()                                        \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*)
-
-#else  // long double overloads are available
-
-#define KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)                         \
-  template <typename Arithmetic1, typename Arithmetic2,                      \
-            typename =                                                       \
-                std::enable_if_t<std::is_arithmetic<Arithmetic1>::value &&   \
-                                 std::is_arithmetic<Arithmetic2>::value>,    \
-            typename Promoted = std::conditional_t<                          \
-                std::is_same<Arithmetic1, long double>::value ||             \
-                    std::is_same<Arithmetic2, long double>::value,           \
-                long double, double>>                                        \
-  KOKKOS_INLINE_FUNCTION Promoted FUNC(Arithmetic1 x, Arithmetic2 y) {       \
-    return Kokkos::Experimental::FUNC(                                       \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic1>::value, \
-                                       double, Arithmetic1>>(x),             \
-        static_cast<std::conditional_t<std::is_integral<Arithmetic2>::value, \
-                                       double, Arithmetic2>>(y));            \
+#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                              \
+  KOKKOS_INLINE_FUNCTION bool FUNC(float x) {                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  KOKKOS_INLINE_FUNCTION bool FUNC(double x) {                              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(x);                                                         \
+  }                                                                         \
+  template <class T>                                                        \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_integral<T>::value, bool> \
+  FUNC(T x) {                                                               \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                       \
+    return FUNC(static_cast<double>(x));                                    \
   }
 
-#define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC)                               \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, float, float)             \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, double, double)           \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, long double, long double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, double)
-
-#define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC)                       \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, float)       \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, double)      \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(FUNC, bool, long double) \
-  KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL(FUNC, bool)
-
-#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                  \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, float)       \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, double)      \
-  KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT(FUNC, long double) \
-  KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC(FUNC)
-
-#define KOKKOS_IMPL_MATH_NAN()                                        \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanf, float, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nan, double, char const*) \
-  KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT(nanl, long double, char const*)
+#define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC)                          \
+  KOKKOS_INLINE_FUNCTION float FUNC(float x, float y) {                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  KOKKOS_INLINE_FUNCTION double FUNC(double x, double y) {              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float y) {              \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(x, y);                                                  \
+  }                                                                     \
+  template <class T1, class T2>                                         \
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<                              \
+      std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value && \
+          !std::is_same<T1, long double>::value &&                      \
+          !std::is_same<T2, long double>::value,                        \
+      Kokkos::Impl::promote_2_t<T1, T2>>                                \
+  FUNC(T1 x, T2 y) {                                                    \
+    using Promoted = Kokkos::Impl::promote_2_t<T1, T2>;                 \
+    using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC;                   \
+    return FUNC(static_cast<Promoted>(x), static_cast<Promoted>(y));    \
+  }
 
 #endif
 
 // Basic operations
+KOKKOS_INLINE_FUNCTION int abs(int n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION long abs(long n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION long long abs(long long n) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(n);
+}
+KOKKOS_INLINE_FUNCTION float abs(float x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+KOKKOS_INLINE_FUNCTION double abs(double x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+KOKKOS_INLINE_FUNCTION long double abs(long double x) {
+  using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::abs;
+  return abs(x);
+}
+#endif
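+// Note: the abs overloads above preserve integral return types (int, long,
+// long long), whereas the fabs family generated below always computes in
+// floating point and returns double for integral arguments.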
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder)
@@ -172,7 +280,18 @@ KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmax)
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(fdim)
 #ifndef KOKKOS_ENABLE_SYCL
-KOKKOS_IMPL_MATH_NAN()
+KOKKOS_INLINE_FUNCTION float nanf(char const* arg) { return ::nanf(arg); }
+KOKKOS_INLINE_FUNCTION double nan(char const* arg) { return ::nan(arg); }
+#if defined(KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS)
+KOKKOS_INLINE_FUNCTION long double nanl(char const* arg) { return ::nanl(arg); }
+#endif
+#else
+// FIXME_SYCL
+// sycl::nan does not follow the C/C++ standard library and takes an unsigned
+// integer as argument.  The current implementation does not attempt to convert
+// the character string arg into the quiet NaN value.
+KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); }
+KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); }
 #endif
 // Power functions
 KOKKOS_IMPL_MATH_BINARY_FUNCTION(pow)
@@ -211,6 +330,7 @@ KOKKOS_IMPL_MATH_UNARY_FUNCTION(lgamma)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(ceil)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(floor)
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(trunc)
+// FIXME_SYCL not available as of current SYCL specification v1.2.1
 #ifndef KOKKOS_ENABLE_SYCL
 KOKKOS_IMPL_MATH_UNARY_FUNCTION(nearbyint)
 #endif
@@ -219,14 +339,12 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(isfinite)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isinf)
 KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnan)
 
-#undef KOKKOS_IMPL_UNARY_FUNCTION_FLOATING_POINT
-#undef KOKKOS_IMPL_UNARY_FUNCTION_INTEGRAL
-#undef KOKKOS_IMPL_BINARY_FUNCTION_FLOATING_POINT
-#undef KOKKOS_IMPL_BINARY_FUNCTION_ARITHMETIC
+#undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE
+#undef KOKKOS_IMPL_MATH_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
 #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION
 #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE
 #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION
-#undef KOKKOS_IMPL_MATH_NAN
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7bcea91c86790dd52265addb1ca651adbe21a966
--- /dev/null
+++ b/packages/kokkos/core/src/Kokkos_MathematicalSpecialFunctions.hpp
@@ -0,0 +1,1280 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
+#define KOKKOS_MATHEMATICAL_SPECIAL_FUNCTIONS_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cmath>
+#include <algorithm>
+#include <type_traits>
+#include <Kokkos_MathematicalFunctions.hpp>
+#include <Kokkos_NumericTraits.hpp>
+#include <Kokkos_Complex.hpp>
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace Kokkos {
+namespace Experimental {
+
+//! Compute exponential integral E1(x) (x > 0).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION RealType expint1(RealType x) {
+  // This function is a conversion of the corresponding Fortran program in
+  // S. Zhang & J. Jin "Computation of Special Functions" (Wiley, 1996).
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::log;
+  using Kokkos::Experimental::pow;
+
+  RealType e1;
+
+  if (x < 0) {
+    e1 = -infinity<RealType>::value;
+  } else if (x == 0.0) {
+    e1 = infinity<RealType>::value;
+  } else if (x <= 1.0) {
+    e1         = 1.0;
+    RealType r = 1.0;
+    for (int k = 1; k <= 25; k++) {
+      RealType k_real = static_cast<RealType>(k);
+      r               = -r * k_real * x / pow(k_real + 1.0, 2.0);
+      e1              = e1 + r;
+      if (fabs(r) <= fabs(e1) * epsilon<RealType>::value) break;
+    }
+    e1 = -0.5772156649015328 - log(x) + x * e1;
+  } else {
+    int m       = 20 + static_cast<int>(80.0 / x);
+    RealType t0 = 0.0;
+    for (int k = m; k >= 1; k--) {
+      RealType k_real = static_cast<RealType>(k);
+      t0              = k_real / (1.0 + k_real / (x + t0));
+    }
+    e1 = exp(-x) * (1.0 / (x + t0));
+  }
+  return e1;
+}
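+// Illustrative check (not part of the algorithm): expint1(1.0) is roughly
+// 0.2194, from the power-series branch above; arguments greater than 1 use
+// the continued-fraction branch instead.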
+
+//! Compute error function erf(z) for z=cmplx(x,y).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erf(
+    const Kokkos::complex<RealType>& z) {
+  // This function is a conversion of the corresponding Fortran program written
+  // by D.E. Amos, May,1974. D.E. Amos' revisions of Jan 86 incorporated by
+  // Ken Damrau on 27-Jan-1986 14:37:13
+  //
+  // Reference: NBS HANDBOOK OF MATHEMATICAL FUNCTIONS, AMS 55, By
+  //           M. ABRAMOWITZ AND I.A. STEGUN, December,1955.
+  // Summary:
+  //  If x < 0, z is replaced by -z and all computation is done in the right
+  //  half plane, except for z inside the circle abs(z)<=2, since
+  //  erf(-z)=-erf(z). The regions for computation are divided as follows
+  //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
+  //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
+  //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymptotic expansion
+  //  Error condition: abs(z^2) > 670 is a fatal overflow error
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::sin;
+
+  using CmplxType = Kokkos::complex<RealType>;
+
+  auto const inf = infinity<RealType>::value;
+  auto const tol = epsilon<RealType>::value;
+
+  const RealType fnorm = 1.12837916709551;
+  const RealType gnorm = 0.564189583547756;
+  const RealType eh    = 0.606530659712633;
+  const RealType ef    = 0.778800783071405;
+  // const RealType tol   = 1.0e-13;
+  const RealType pi = M_PI;
+
+  CmplxType cans;
+
+  RealType az = Kokkos::abs(z);
+  if (az <= 2.0) {  // Series for abs(z)<=2.0
+    CmplxType cz    = z * z;
+    CmplxType accum = CmplxType(1.0, 0.0);
+    CmplxType term  = accum;
+    RealType ak     = 1.5;
+    for (int i = 1; i <= 35; i++) {
+      term  = term * cz / ak;
+      accum = accum + term;
+      if (Kokkos::abs(term) <= tol) break;
+      ak = ak + 1.0;
+    }
+    cz          = -cz;
+    RealType er = cz.real();
+    RealType ei = cz.imag();
+    accum       = accum * z * fnorm;
+    cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+    cans        = accum * cz;
+  }       // end (az <= 2.0)
+  else {  //(az > 2.0)
+    CmplxType zp = z;
+    if (z.real() < 0.0) zp = -z;
+    CmplxType cz = zp * zp;
+    RealType xp  = zp.real();
+    RealType yp  = zp.imag();
+    if (xp > 1.0) {
+      // continued fraction for erfc(z), abs(z)>2
+      int n          = static_cast<int>(100.0 / az + 5.0);
+      int fn         = n;
+      CmplxType term = cz;
+      for (int i = 1; i <= n; i++) {
+        RealType fnh = fn - 0.5;
+        term         = cz + (fnh * term) / (fn + term);
+        fn           = fn - 1;
+      }
+      if (Kokkos::abs(cz) > 670.0) return CmplxType(inf, inf);
+      cz              = -cz;
+      RealType er     = cz.real();
+      RealType ei     = cz.imag();
+      cz              = exp(er) * CmplxType(cos(ei), sin(ei));
+      CmplxType accum = zp * gnorm * cz;
+      cans            = 1.0 - accum / term;
+      if (z.real() < 0.0) cans = -cans;
+    }       // end (xp > 1.0)
+    else {  //(xp <= 1.0)
+      if (fabs(yp) <
+          6.0) {  // Series (3) for abs(z)>2 and 0<=xp<=1 and abs(yp)<6
+        RealType s1   = 0.0;
+        RealType s2   = 0.0;
+        RealType x2   = xp * xp;
+        RealType fx2  = 4.0 * x2;
+        RealType tx   = xp + xp;
+        RealType xy   = xp * yp;
+        RealType sxyh = sin(xy);
+        RealType sxy  = sin(xy + xy);
+        RealType cxy  = cos(xy + xy);
+        RealType fn   = 1.0;
+        RealType fnh  = 0.5;
+        RealType ey   = exp(yp);
+        RealType en   = ey;
+        RealType ehn  = eh;
+        RealType un   = ef;
+        RealType vn   = 1.0;
+        for (int i = 1; i <= 50; i++) {
+          RealType ren = 1.0 / en;
+          RealType csh = en + ren;
+          RealType tm  = xp * csh;
+          RealType ssh = en - ren;
+          RealType tmp = fnh * ssh;
+          RealType rn  = tx - tm * cxy + tmp * sxy;
+          RealType ain = tm * sxy + tmp * cxy;
+          RealType cf  = un / (vn + fx2);
+          rn           = cf * rn;
+          ain          = cf * ain;
+          s1           = s1 + rn;
+          s2           = s2 + ain;
+          if ((fabs(rn) + fabs(ain)) < tol * (fabs(s1) + fabs(s2))) break;
+          un  = un * ehn * ef;
+          ehn = ehn * eh;
+          en  = en * ey;
+          vn  = vn + fn + fn + 1.0;
+          fnh = fnh + 0.5;
+          fn  = fn + 1.0;
+        }
+        s1 = s1 + s1;
+        s2 = s2 + s2;
+        if (z.real() == 0.0)
+          s2 = s2 + yp;
+        else {
+          s1 = s1 + sxyh * sxyh / xp;
+          s2 = s2 + sxy / tx;
+        }
+        // Power series for erf(xp), 0<=xp<=1
+        RealType w  = 1.0;
+        RealType ak = 1.5;
+        RealType tm = 1.0;
+        for (int i = 1; i <= 17; i++) {
+          tm = tm * x2 / ak;
+          w  = w + tm;
+          if (tm <= tol) break;
+          ak = ak + 1.0;
+        }
+        RealType ex = exp(-x2);
+        w           = w * xp * fnorm * ex;
+        RealType cf = ex / pi;
+        s1          = cf * s1 + w;
+        s2          = cf * s2;
+        cans        = CmplxType(s1, s2);
+        if (z.real() < 0.0) cans = -cans;
+      }       // end (abs(yp) < 6.0)
+      else {  //(abs(YP)>=6.0)
+        // Asymptotic expansion for 0<=xp<=1 and abs(yp)>=6
+        CmplxType rcz   = 0.5 / cz;
+        CmplxType accum = CmplxType(1.0, 0.0);
+        CmplxType term  = accum;
+        RealType ak     = 1.0;
+        for (int i = 1; i <= 35; i++) {
+          term  = -term * ak * rcz;
+          accum = accum + term;
+          if (Kokkos::abs(term) / Kokkos::abs(accum) <= tol) break;
+          ak = ak + 2.0;
+        }
+        accum       = accum * gnorm / zp;
+        cz          = -cz;
+        RealType er = cz.real();
+        if (fabs(er) > 670.0) return CmplxType(inf, inf);
+        RealType ei = cz.imag();
+        cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+        cans        = 1.0 - accum * cz;
+        if (z.real() < 0.0) cans = -cans;
+      }  // end (abs(YP)>=6.0)
+    }    // end (xp <= 1.0)
+  }      // end (az > 2.0)
+  return cans;
+}
+
+//! Compute scaled complementary error function erfcx(z)=exp(z^2)*erfc(z)
+//! for z=cmplx(x,y).
+template <class RealType>
+KOKKOS_INLINE_FUNCTION Kokkos::complex<RealType> erfcx(
+    const Kokkos::complex<RealType>& z) {
+  // This function is a conversion of the corresponding Fortran program written
+  // by D.E. Amos, May,1974. D.E. Amos' revisions of Jan 86 incorporated by
+  // Ken Damrau on 27-Jan-1986 14:37:13
+  //
+  // Reference: NBS HANDBOOK OF MATHEMATICAL FUNCTIONS, AMS 55, By
+  //           M. ABRAMOWITZ AND I.A. STEGUN, December,1955.
+  // Summary:
+  //  If x < 0, z is replaced by -z and all computation is done in the right
+  //  half plane, except for z inside the circle abs(z)<=2, since
+  //  erfc(-z)=2-erfc(z). The regions for computation are divided as follows
+  //      (1)  abs(z)<=2 - Power series, NBS Handbook, p. 298
+  //      (2)  abs(z)>2 and x>1 - continued fraction, NBS Handbook, p. 298
+  //      (3)  abs(z)>2 and 0<=x<=1 and abs(y)<6 - series, NBS Handbook, p. 299
+  //      (4)  abs(z)>2 and 0<=x<=1 and abs(y)>=6 - asymptotic expansion
+  // Error condition: abs(z^2) > 670 is a fatal overflow error when x<0
+  using Kokkos::Experimental::cos;
+  using Kokkos::Experimental::epsilon;
+  using Kokkos::Experimental::exp;
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::isinf;
+  using Kokkos::Experimental::sin;
+
+  using CmplxType = Kokkos::complex<RealType>;
+
+  auto const inf = infinity<RealType>::value;
+  auto const tol = epsilon<RealType>::value;
+
+  const RealType fnorm = 1.12837916709551;
+  const RealType gnorm = 0.564189583547756;
+  const RealType eh    = 0.606530659712633;
+  const RealType ef    = 0.778800783071405;
+  // const RealType tol   = 1.0e-13;
+  const RealType pi = M_PI;
+
+  CmplxType cans;
+
+  if ((isinf(z.real())) && (z.real() > 0)) {
+    cans = CmplxType(0.0, 0.0);
+    return cans;
+  }
+  if ((isinf(z.real())) && (z.real() < 0)) {
+    cans = CmplxType(inf, inf);
+    return cans;
+  }
+
+  RealType az = Kokkos::abs(z);
+  if (az <= 2.0) {  // Series for abs(z)<=2.0
+    CmplxType cz    = z * z;
+    CmplxType accum = CmplxType(1.0, 0.0);
+    CmplxType term  = accum;
+    RealType ak     = 1.5;
+    for (int i = 1; i <= 35; i++) {
+      term  = term * cz / ak;
+      accum = accum + term;
+      if (Kokkos::abs(term) <= tol) break;
+      ak = ak + 1.0;
+    }
+    cz          = -cz;
+    RealType er = cz.real();
+    RealType ei = cz.imag();
+    accum       = accum * z * fnorm;
+    cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+    cans        = 1.0 / cz - accum;
+  }       // end (az <= 2.0)
+  else {  //(az > 2.0)
+    CmplxType zp = z;
+    if (z.real() < 0.0) zp = -z;
+    CmplxType cz = zp * zp;
+    RealType xp  = zp.real();
+    RealType yp  = zp.imag();
+    if (xp > 1.0) {
+      // continued fraction for erfc(z), abs(z)>2
+      int n          = static_cast<int>(100.0 / az + 5.0);
+      int fn         = n;
+      CmplxType term = cz;
+      for (int i = 1; i <= n; i++) {
+        RealType fnh = fn - 0.5;
+        term         = cz + (fnh * term) / (fn + term);
+        fn           = fn - 1;
+      }
+      cans = zp * gnorm / term;
+      if (z.real() >= 0.0) return cans;
+      if (Kokkos::abs(cz) > 670.0) return CmplxType(inf, inf);
+      cz          = -cz;
+      RealType er = cz.real();
+      RealType ei = cz.imag();
+      cz          = exp(er) * CmplxType(cos(ei), sin(ei));
+      cz          = 1.0 / cz;
+      cans        = cz + cz - cans;
+    }       // end (xp > 1.0)
+    else {  //(xp <= 1.0)
+      if (fabs(yp) <
+          6.0) {  // Series (3) for abs(z)>2 and 0<=xp<=1 and abs(yp)<6
+        RealType s1   = 0.0;
+        RealType s2   = 0.0;
+        RealType x2   = xp * xp;
+        RealType fx2  = 4.0 * x2;
+        RealType tx   = xp + xp;
+        RealType xy   = xp * yp;
+        RealType sxyh = sin(xy);
+        RealType sxy  = sin(xy + xy);
+        RealType cxy  = cos(xy + xy);
+        RealType fn   = 1.0;
+        RealType fnh  = 0.5;
+        RealType ey   = exp(yp);
+        RealType en   = ey;
+        RealType ehn  = eh;
+        RealType un   = ef;
+        RealType vn   = 1.0;
+        for (int i = 1; i <= 50; i++) {
+          RealType ren = 1.0 / en;
+          RealType csh = en + ren;
+          RealType tm  = xp * csh;
+          RealType ssh = en - ren;
+          RealType tmp = fnh * ssh;
+          RealType rn  = tx - tm * cxy + tmp * sxy;
+          RealType ain = tm * sxy + tmp * cxy;
+          RealType cf  = un / (vn + fx2);
+          rn           = cf * rn;
+          ain          = cf * ain;
+          s1           = s1 + rn;
+          s2           = s2 + ain;
+          if ((fabs(rn) + fabs(ain)) < tol * (fabs(s1) + fabs(s2))) break;
+          un  = un * ehn * ef;
+          ehn = ehn * eh;
+          en  = en * ey;
+          vn  = vn + fn + fn + 1.0;
+          fnh = fnh + 0.5;
+          fn  = fn + 1.0;
+        }
+        s1 = s1 + s1;
+        s2 = s2 + s2;
+        if (z.real() == 0.0)
+          s2 = s2 + yp;
+        else {
+          s1 = s1 + sxyh * sxyh / xp;
+          s2 = s2 + sxy / tx;
+        }
+        // Power series for erf(xp), 0<=xp<=1
+        RealType w  = 1.0;
+        RealType ak = 1.5;
+        RealType tm = 1.0;
+        for (int i = 1; i <= 17; i++) {
+          tm = tm * x2 / ak;
+          w  = w + tm;
+          if (tm <= tol) break;
+          ak = ak + 1.0;
+        }
+        RealType ex   = exp(-x2);
+        w             = w * xp * fnorm * ex;
+        CmplxType rcz = CmplxType(cxy, sxy);
+        RealType y2   = yp * yp;
+        cz            = exp(x2 - y2) * rcz;
+        rcz           = exp(-y2) * rcz;
+        if (z.real() >= 0.0)
+          cans = cz * (1.0 - w) - rcz * CmplxType(s1, s2) / pi;
+        else
+          cans = cz * (1.0 + w) + rcz * CmplxType(s1, s2) / pi;
+      }       // end (abs(yp) < 6.0)
+      else {  //(abs(YP)>=6.0)
+        // Asymptotic expansion for 0<=xp<=1 and abs(yp)>=6
+        CmplxType rcz   = 0.5 / cz;
+        CmplxType accum = CmplxType(1.0, 0.0);
+        CmplxType term  = accum;
+        RealType ak     = 1.0;
+        for (int i = 1; i <= 35; i++) {
+          term  = -term * ak * rcz;
+          accum = accum + term;
+          if (Kokkos::abs(term) / Kokkos::abs(accum) <= tol) break;
+          ak = ak + 2.0;
+        }
+        accum = accum * gnorm / zp;
+        if (z.real() < 0.0) accum = -accum;
+        cans = accum;
+      }  // end (abs(YP)>=6.0)
+    }    // end (xp <= 1.0)
+  }      // end (az > 2.0)
+  return cans;
+}
+
+//! Compute scaled complementary error function erfcx(x)=exp(x^2)*erfc(x)
+//! for real x
+template <class RealType>
+KOKKOS_INLINE_FUNCTION RealType erfcx(RealType x) {
+  using CmplxType = Kokkos::complex<RealType>;
+  // Note: using erfcx(complex) for now
+  // TODO: replace with an implementation of erfcx(real)
+  CmplxType zin  = CmplxType(x, 0.0);
+  CmplxType zout = erfcx(zin);
+  return zout.real();
+}
+
+//! Compute Bessel function J0(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbj0      --- J0(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::pow;
+
+  CmplxType cbj0;
+  const RealType pi    = M_PI;
+  const RealType a[12] = {
+      -0.703125e-01,           0.112152099609375e+00,   -0.5725014209747314e+00,
+      0.6074042001273483e+01,  -0.1100171402692467e+03, 0.3038090510922384e+04,
+      -0.1188384262567832e+06, 0.6252951493434797e+07,  -0.4259392165047669e+09,
+      0.3646840080706556e+11,  -0.3833534661393944e+13, 0.4854014686852901e+15};
+  const RealType b[12] = {0.732421875e-01,        -0.2271080017089844e+00,
+                          0.1727727502584457e+01, -0.2438052969955606e+02,
+                          0.5513358961220206e+03, -0.1825775547429318e+05,
+                          0.8328593040162893e+06, -0.5006958953198893e+08,
+                          0.3836255180230433e+10, -0.3649010818849833e+12,
+                          0.4218971570284096e+14, -0.5827244631566907e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbj0 = CmplxType(1.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct1 = z1 - 0.25 * pi;
+      CmplxType cp0 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.9)
+        cp0 = cp0 + a[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq0 = -0.125 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.10)
+        cq0 = cq0 + b[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj0         = cu * (cp0 * Kokkos::cos(ct1) - cq0 * Kokkos::sin(ct1));
+    }
+  }
+  return cbj0;
+}
+
+//! Compute Bessel function Y0(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cby0      --- Y0(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cby0, cbj0;
+  const RealType pi    = M_PI;
+  const RealType el    = 0.57721566490153286060651209008240;
+  const RealType a[12] = {
+      -0.703125e-01,           0.112152099609375e+00,   -0.5725014209747314e+00,
+      0.6074042001273483e+01,  -0.1100171402692467e+03, 0.3038090510922384e+04,
+      -0.1188384262567832e+06, 0.6252951493434797e+07,  -0.4259392165047669e+09,
+      0.3646840080706556e+11,  -0.3833534661393944e+13, 0.4854014686852901e+15};
+  const RealType b[12] = {0.732421875e-01,        -0.2271080017089844e+00,
+                          0.1727727502584457e+01, -0.2438052969955606e+02,
+                          0.5513358961220206e+03, -0.1825775547429318e+05,
+                          0.8328593040162893e+06, -0.5006958953198893e+08,
+                          0.3836255180230433e+10, -0.3649010818849833e+12,
+                          0.4218971570284096e+14, -0.5827244631566907e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cby0 = -CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0, ce;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+      ce   = Kokkos::log(z / 2.0) + el;
+      cby0 = r2p * (ce * cbj0 - 4.0 * csu / cs0);
+    } else {  // Using asymptotic expansion (5.2.6) for |z|>joint_val
+              // (default:25)
+      CmplxType ct1 = z1 - 0.25 * pi;
+      CmplxType cp0 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.9)
+        cp0 = cp0 + a[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq0 = -0.125 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.10)
+        cq0 = cq0 + b[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj0         = cu * (cp0 * Kokkos::cos(ct1) - cq0 * Kokkos::sin(ct1));
+      cby0         = cu * (cp0 * Kokkos::sin(ct1) + cq0 * Kokkos::cos(ct1));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        if (z.imag() < 0.0) cby0 = cby0 - 2.0 * ci * cbj0;
+        if (z.imag() >= 0.0) cby0 = cby0 + 2.0 * ci * cbj0;
+      }
+    }
+  }
+  return cby0;
+}
+
+//! Compute Bessel function J1(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_j1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbj1      --- J1(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::pow;
+
+  CmplxType cbj1;
+  const RealType pi     = M_PI;
+  const RealType a1[12] = {0.1171875e+00,          -0.144195556640625e+00,
+                           0.6765925884246826e+00, -0.6883914268109947e+01,
+                           0.1215978918765359e+03, -0.3302272294480852e+04,
+                           0.1276412726461746e+06, -0.6656367718817688e+07,
+                           0.4502786003050393e+09, -0.3833857520742790e+11,
+                           0.4011838599133198e+13, -0.5060568503314727e+15};
+  const RealType b1[12] = {
+      -0.1025390625e+00,       0.2775764465332031e+00,  -0.1993531733751297e+01,
+      0.2724882731126854e+02,  -0.6038440767050702e+03, 0.1971837591223663e+05,
+      -0.8902978767070678e+06, 0.5310411010968522e+08,  -0.4043620325107754e+10,
+      0.3827011346598605e+12,  -0.4406481417852278e+14, 0.6065091351222699e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbj1 = CmplxType(0.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 1) cbj1 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj1 = cbj1 / cs0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct2 = z1 - 0.75 * pi;
+      CmplxType cp1 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.11)
+        cp1 = cp1 + a1[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq1 = 0.375 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.12)
+        cq1 = cq1 + b1[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj1         = cu * (cp1 * Kokkos::cos(ct2) - cq1 * Kokkos::sin(ct2));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        cbj1 = -cbj1;
+      }
+    }
+  }
+  return cbj1;
+}
+
+//! Compute Bessel function Y1(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_y1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // program CJYNB in S. Zhang & J. Jin "Computation of Special Functions"
+  //(Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cby1      --- Y1(z)
+  using Kokkos::Experimental::fabs;
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cby1, cbj0, cbj1, cby0;
+  const RealType pi     = M_PI;
+  const RealType el     = 0.57721566490153286060651209008240;
+  const RealType a1[12] = {0.1171875e+00,          -0.144195556640625e+00,
+                           0.6765925884246826e+00, -0.6883914268109947e+01,
+                           0.1215978918765359e+03, -0.3302272294480852e+04,
+                           0.1276412726461746e+06, -0.6656367718817688e+07,
+                           0.4502786003050393e+09, -0.3833857520742790e+11,
+                           0.4011838599133198e+13, -0.5060568503314727e+15};
+  const RealType b1[12] = {
+      -0.1025390625e+00,       0.2775764465332031e+00,  -0.1993531733751297e+01,
+      0.2724882731126854e+02,  -0.6038440767050702e+03, 0.1971837591223663e+05,
+      -0.8902978767070678e+06, 0.5310411010968522e+08,  -0.4043620325107754e+10,
+      0.3827011346598605e+12,  -0.4406481417852278e+14, 0.6065091351222699e+16};
+
+  RealType r2p = 2.0 / pi;
+  RealType a0  = Kokkos::abs(z);
+  RealType y0  = fabs(z.imag());
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cby1 = -CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      CmplxType csu = CmplxType(0.0, 0.0);
+      CmplxType csv = CmplxType(0.0, 0.0);
+      CmplxType cf2 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0, ce;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf                    = 2.0 * (k + 1.0) / z * cf1 - cf2;
+        RealType tmp_exponent = static_cast<RealType>(k / 2);
+        if (k == 1) cbj1 = cf;
+        if (k == 0) cbj0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          if (y0 <= 1.0)
+            cbs = cbs + 2.0 * cf;
+          else
+            cbs = cbs + pow(-1.0, tmp_exponent) * 2.0 * cf;
+          csu = csu + pow(-1.0, tmp_exponent) * cf / k;
+        } else if (k > 1) {
+          csv = csv + pow(-1.0, tmp_exponent) * k / (k * k - 1.0) * cf;
+        }
+        cf2 = cf1;
+        cf1 = cf;
+      }
+      if (y0 <= 1.0)
+        cs0 = cbs + cf;
+      else
+        cs0 = (cbs + cf) / Kokkos::cos(z);
+      cbj0 = cbj0 / cs0;
+      ce   = Kokkos::log(z / 2.0) + el;
+      cby0 = r2p * (ce * cbj0 - 4.0 * csu / cs0);
+      cbj1 = cbj1 / cs0;
+      cby1 = (cbj1 * cby0 - 2.0 / (pi * z)) / cbj0;
+    } else {  // Using asymptotic expansion (5.2.5) for |z|>joint_val
+              // (default:25)
+      CmplxType ct2 = z1 - 0.75 * pi;
+      CmplxType cp1 = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.11)
+        cp1 = cp1 + a1[k - 1] * Kokkos::pow(z1, -2.0 * k);
+      }
+      CmplxType cq1 = 0.375 / z1;
+      for (int k = 1; k <= 12; k++) {  // Calculate (5.2.12)
+        cq1 = cq1 + b1[k - 1] * Kokkos::pow(z1, -2.0 * k - 1);
+      }
+      CmplxType cu = Kokkos::sqrt(r2p / z1);
+      cbj1         = cu * (cp1 * Kokkos::cos(ct2) - cq1 * Kokkos::sin(ct2));
+      cby1         = cu * (cp1 * Kokkos::sin(ct2) + cq1 * Kokkos::cos(ct2));
+
+      if (z.real() < 0.0) {  // Apply (5.4.2)
+        if (z.imag() < 0.0) cby1 = -(cby1 - 2.0 * ci * cbj1);
+        if (z.imag() >= 0.0) cby1 = -(cby1 + 2.0 * ci * cbj1);
+      }
+    }
+  }
+  return cby1;
+}
+
+//! Compute modified Bessel function I0(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i0(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbi0      --- I0(z)
+  CmplxType cbi0;
+  const RealType pi    = M_PI;
+  const RealType a[12] = {0.125,
+                          7.03125e-2,
+                          7.32421875e-2,
+                          1.1215209960938e-1,
+                          2.2710800170898e-1,
+                          5.7250142097473e-1,
+                          1.7277275025845e0,
+                          6.0740420012735e0,
+                          2.4380529699556e1,
+                          1.1001714026925e2,
+                          5.5133589612202e2,
+                          3.0380905109224e3};
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbi0 = CmplxType(1.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      // CmplxType csk0 = CmplxType(0.0,0.0);
+      CmplxType cf0 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 0) cbi0 = cf;
+        // if ((k == 2*(k/2)) && (k != 0)) {
+        //  csk0 = csk0+4.0*cf/static_cast<RealType>(k);
+        //}
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+    } else {  // Using asymptotic expansion (6.2.1) for |z|>joint_val
+              // (default:25)
+      CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1);
+      cbi0         = CmplxType(1.0, 0.0);
+      CmplxType zr = 1.0 / z1;
+      for (int k = 1; k <= 12; k++) {
+        cbi0 = cbi0 + a[k - 1] * Kokkos::pow(zr, 1.0 * k);
+      }
+      cbi0 = ca * cbi0;
+    }
+  }
+  return cbi0;
+}
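+
+// For small |z| the backward-recurrence branch above can be cross-checked
+// against the defining power series I0(z) = sum_k (z^2/4)^k / (k!)^2.
+// Host-side sketch (test point, term count, and tolerance are arbitrary):
+//
+//   Kokkos::complex<double> z(1.5, 0.5), term(1.0, 0.0), sum(1.0, 0.0);
+//   for (int k = 1; k < 30; ++k) {
+//     term = term * (z * z / 4.0) / (static_cast<double>(k) * k);
+//     sum  = sum + term;
+//   }
+//   // sum should agree with
+//   // cyl_bessel_i0<Kokkos::complex<double>, double, int>(z)
+//   // to roughly machine precision.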
+
+//! Compute modified Bessel function K0(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k0(const CmplxType& z,
+                                               const RealType& joint_val = 9,
+                                               const IntType& bw_start   = 30) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Purpose: Compute modified Bessel function K0(z) of the second kind of
+  //             order zero for a complex argument
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbk0      --- K0(z)
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cbk0, cbi0;
+  const RealType pi = M_PI;
+  const RealType el = 0.57721566490153286060651209008240;
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbk0 = CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:9)
+      CmplxType cbs  = CmplxType(0.0, 0.0);
+      CmplxType csk0 = CmplxType(0.0, 0.0);
+      CmplxType cf0  = CmplxType(0.0, 0.0);
+      CmplxType cf1  = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 30)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 0) cbi0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          csk0 = csk0 + 4.0 * cf / static_cast<RealType>(k);
+        }
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+      cbk0 = -(Kokkos::log(0.5 * z1) + el) * cbi0 + cs0 * csk0;
+    } else {  // Using asymptotic expansion (6.2.2) for |z|>joint_val
+              // (default:9)
+      CmplxType ca0  = Kokkos::sqrt(pi / (2.0 * z1)) * Kokkos::exp(-z1);
+      CmplxType cbkl = CmplxType(1.0, 0.0);
+      CmplxType cr   = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 30; k++) {
+        cr   = 0.125 * cr * (0.0 - pow(2.0 * k - 1.0, 2.0)) / (k * z1);
+        cbkl = cbkl + cr;
+      }
+      cbk0 = ca0 * cbkl;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      if (z.imag() < 0.0)
+        cbk0 = cbk0 + ci * pi * cyl_bessel_i0<CmplxType, RealType, IntType>(z);
+      if (z.imag() >= 0.0)
+        cbk0 = cbk0 - ci * pi * cyl_bessel_i0<CmplxType, RealType, IntType>(z);
+    }
+  }
+  return cbk0;
+}
+
+//! Compute modified Bessel function I1(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_i1(const CmplxType& z,
+                                               const RealType& joint_val = 25,
+                                               const IntType& bw_start   = 70) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbi1      --- I1(z)
+  CmplxType cbi1;
+  const RealType pi    = M_PI;
+  const RealType b[12] = {-0.375,
+                          -1.171875e-1,
+                          -1.025390625e-1,
+                          -1.4419555664063e-1,
+                          -2.7757644653320e-1,
+                          -6.7659258842468e-1,
+                          -1.9935317337513,
+                          -6.8839142681099,
+                          -2.7248827311269e1,
+                          -1.2159789187654e2,
+                          -6.0384407670507e2,
+                          -3.3022722944809e3};
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbi1 = CmplxType(0.0, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:25)
+      CmplxType cbs = CmplxType(0.0, 0.0);
+      // CmplxType csk0 = CmplxType(0.0,0.0);
+      CmplxType cf0 = CmplxType(0.0, 0.0);
+      CmplxType cf1 = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 70)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 1) cbi1 = cf;
+        // if ((k == 2*(k/2)) && (k != 0)) {
+        //  csk0 = csk0+4.0*cf/static_cast<RealType>(k);
+        //}
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi1 = cbi1 * cs0;
+    } else {  // Using asymptotic expansion (6.2.1) for |z|>joint_val
+              // (default:25)
+      CmplxType ca = Kokkos::exp(z1) / Kokkos::sqrt(2.0 * pi * z1);
+      cbi1         = CmplxType(1.0, 0.0);
+      CmplxType zr = 1.0 / z1;
+      for (int k = 1; k <= 12; k++) {
+        cbi1 = cbi1 + b[k - 1] * Kokkos::pow(zr, 1.0 * k);
+      }
+      cbi1 = ca * cbi1;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      cbi1 = -cbi1;
+    }
+  }
+  return cbi1;
+}
+
+//! Compute modified Bessel function K1(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType, class RealType, class IntType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_k1(const CmplxType& z,
+                                               const RealType& joint_val = 9,
+                                               const IntType& bw_start   = 30) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CIKNB and CIK01 in S. Zhang & J. Jin "Computation of Special
+  // Functions" (Wiley, 1996).
+  //    Input :  z         --- Complex argument
+  //             joint_val --- Joint point of abs(z) separating small and large
+  //                           argument regions
+  //             bw_start  --- Starting point for backward recurrence
+  //    Output:  cbk1      --- K1(z)
+  using Kokkos::Experimental::infinity;
+  using Kokkos::Experimental::pow;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType cbk0, cbi0, cbk1, cbi1;
+  const RealType pi = M_PI;
+  const RealType el = 0.57721566490153286060651209008240;
+
+  RealType a0  = Kokkos::abs(z);
+  CmplxType ci = CmplxType(0.0, 1.0);
+  CmplxType z1 = z;
+
+  if (a0 < 1e-100) {  // Treat z=0 as a special case
+    cbk1 = CmplxType(inf, 0.0);
+  } else {
+    if (z.real() < 0.0) z1 = -z;
+    if (a0 <= joint_val) {  // Using backward recurrence for |z|<=joint_val
+                            // (default:9)
+      CmplxType cbs  = CmplxType(0.0, 0.0);
+      CmplxType csk0 = CmplxType(0.0, 0.0);
+      CmplxType cf0  = CmplxType(0.0, 0.0);
+      CmplxType cf1  = CmplxType(1e-100, 0.0);
+      CmplxType cf, cs0;
+      for (int k = bw_start; k >= 0; k--) {  // Backward recurrence (default:
+                                             // 30)
+        cf = 2.0 * (k + 1.0) * cf1 / z1 + cf0;
+        if (k == 1) cbi1 = cf;
+        if (k == 0) cbi0 = cf;
+        if ((k == 2 * (k / 2)) && (k != 0)) {
+          csk0 = csk0 + 4.0 * cf / static_cast<RealType>(k);
+        }
+        cbs = cbs + 2.0 * cf;
+        cf0 = cf1;
+        cf1 = cf;
+      }
+      cs0  = Kokkos::exp(z1) / (cbs - cf);
+      cbi0 = cbi0 * cs0;
+      cbi1 = cbi1 * cs0;
+      cbk0 = -(Kokkos::log(0.5 * z1) + el) * cbi0 + cs0 * csk0;
+      cbk1 = (1.0 / z1 - cbi1 * cbk0) / cbi0;
+    } else {  // Using asymptotic expansion (6.2.2) for |z|>joint_val
+              // (default:9)
+      CmplxType ca0  = Kokkos::sqrt(pi / (2.0 * z1)) * Kokkos::exp(-z1);
+      CmplxType cbkl = CmplxType(1.0, 0.0);
+      CmplxType cr   = CmplxType(1.0, 0.0);
+      for (int k = 1; k <= 30; k++) {
+        cr   = 0.125 * cr * (4.0 - pow(2.0 * k - 1.0, 2.0)) / (k * z1);
+        cbkl = cbkl + cr;
+      }
+      cbk1 = ca0 * cbkl;
+    }
+    if (z.real() < 0.0) {  // Apply (6.4.4)
+      if (z.imag() < 0.0)
+        cbk1 = -cbk1 - ci * pi * cyl_bessel_i1<CmplxType, RealType, IntType>(z);
+      if (z.imag() >= 0.0)
+        cbk1 = -cbk1 + ci * pi * cyl_bessel_i1<CmplxType, RealType, IntType>(z);
+    }
+  }
+  return cbk1;
+}
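+
+// The small-argument branch above obtains K1 from the Wronskian relation
+// I0(z)*K1(z) + I1(z)*K0(z) = 1/z, i.e. K1 = (1/z - I1*K0) / I0, so the
+// result can be cross-checked against the other modified Bessel routines.
+// Sketch (the test point is an arbitrary choice):
+//
+//   using C = Kokkos::complex<double>;
+//   C z(2.0, 1.0);
+//   C lhs = cyl_bessel_i0<C, double, int>(z) * cyl_bessel_k1<C, double, int>(z)
+//         + cyl_bessel_i1<C, double, int>(z) * cyl_bessel_k0<C, double, int>(z);
+//   // lhs should be close to 1.0 / z.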
+
+//! Compute Hankel function H10(z) of the first kind of order zero
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h10(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch10, cbk0, cbj0, cby0;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch10 = CmplxType(1.0, -inf);
+  } else if (z.imag() <= 0.0) {
+    cbj0 = cyl_bessel_j0<CmplxType, RealType, int>(z);
+    cby0 = cyl_bessel_y0<CmplxType, RealType, int>(z);
+    ch10 = cbj0 + ci * cby0;
+  } else {  //(z.imag() > 0.0)
+    cbk0 = cyl_bessel_k0<CmplxType, RealType, int>(-ci * z, 18.0, 70);
+    ch10 = 2.0 / (pi * ci) * cbk0;
+  }
+
+  return ch10;
+}
+
+//! Compute Hankel function H11(z) of the first kind of order one
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h11(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch11, cbk1, cbj1, cby1;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch11 = CmplxType(0.0, -inf);
+  } else if (z.imag() <= 0.0) {
+    cbj1 = cyl_bessel_j1<CmplxType, RealType, int>(z);
+    cby1 = cyl_bessel_y1<CmplxType, RealType, int>(z);
+    ch11 = cbj1 + ci * cby1;
+  } else {  //(z.imag() > 0.0)
+    cbk1 = cyl_bessel_k1<CmplxType, RealType, int>(-ci * z, 18.0, 70);
+    ch11 = -2.0 / pi * cbk1;
+  }
+
+  return ch11;
+}
+
+//! Compute Hankel function H20(z) of the second kind of order zero
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h20(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch20, cbk0, cbj0, cby0;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch20 = CmplxType(1.0, inf);
+  } else if (z.imag() >= 0.0) {
+    cbj0 = cyl_bessel_j0<CmplxType, RealType, int>(z);
+    cby0 = cyl_bessel_y0<CmplxType, RealType, int>(z);
+    ch20 = cbj0 - ci * cby0;
+  } else {  //(z.imag() < 0.0)
+    cbk0 = cyl_bessel_k0<CmplxType, RealType, int>(ci * z, 18.0, 70);
+    ch20 = 2.0 / pi * ci * cbk0;
+  }
+
+  return ch20;
+}
+
+//! Compute Hankel function H21(z) of the second kind of order one
+//! for a complex argument
+template <class CmplxType>
+KOKKOS_INLINE_FUNCTION CmplxType cyl_bessel_h21(const CmplxType& z) {
+  // This function is converted and modified from the corresponding Fortran
+  // programs CH12N in S. Zhang & J. Jin "Computation of Special Functions"
+  // (Wiley, 1996).
+  using RealType = typename CmplxType::value_type;
+  using Kokkos::Experimental::infinity;
+
+  auto const inf = infinity<RealType>::value;
+
+  CmplxType ch21, cbk1, cbj1, cby1;
+  const RealType pi = M_PI;
+  CmplxType ci      = CmplxType(0.0, 1.0);
+
+  if ((z.real() == 0.0) && (z.imag() == 0.0)) {
+    ch21 = CmplxType(0.0, inf);
+  } else if (z.imag() >= 0.0) {
+    cbj1 = cyl_bessel_j1<CmplxType, RealType, int>(z);
+    cby1 = cyl_bessel_y1<CmplxType, RealType, int>(z);
+    ch21 = cbj1 - ci * cby1;
+  } else {  //(z.imag() < 0.0)
+    cbk1 = cyl_bessel_k1<CmplxType, RealType, int>(ci * z, 18.0, 70);
+    ch21 = -2.0 / pi * cbk1;
+  }
+
+  return ch21;
+}
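+
+// For arguments handled by the J/Y branches on both sides (e.g. z.imag() == 0)
+// the two Hankel functions are the conjugate combinations H1 = J + i*Y and
+// H2 = J - i*Y, so their sum reproduces 2*J. Quick consistency-check sketch
+// (the test point is an arbitrary choice):
+//
+//   using C = Kokkos::complex<double>;
+//   C z(3.0, 0.0);
+//   C sum = cyl_bessel_h11<C>(z) + cyl_bessel_h21<C>(z);
+//   // sum should be close to 2.0 * cyl_bessel_j1<C, double, int>(z).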
+
+}  // namespace Experimental
+}  // namespace Kokkos
+
+#endif
diff --git a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
index 2cafac1aea462ec29fe1d1cb853cb374ea7e8109..c814e5a22a32d31e1047f52ac55438934c8194d3 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -524,7 +524,7 @@ class MemoryPool {
     // Fast query clock register 'tic' to pseudo-randomize
     // the guess for which block within a superblock should
     // be claimed.  If not available then a search occurs.
-#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GEN)
+#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
     const uint32_t block_id_hint = alloc_size;
 #else
     const uint32_t block_id_hint =
@@ -585,19 +585,6 @@ class MemoryPool {
               (uint64_t(sb_id) << m_sb_size_lg2)       // superblock memory
               + (uint64_t(result.first) << size_lg2);  // block memory
 
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , alloc_size
-        , sb_id
-        , sb_state 
-        , (1u << size_lg2)
-        , (1u << count_lg2)
-        , result.first 
-        , result.second );
-#endif
-
           break;  // Success
         }
       }
@@ -740,7 +727,8 @@ class MemoryPool {
 
     // Determine which superblock and block
     const ptrdiff_t d =
-        ((char *)p) - ((char *)(m_sb_state_array + m_data_offset));
+        static_cast<char *>(p) -
+        reinterpret_cast<char *>(m_sb_state_array + m_data_offset);
 
     // Verify contained within the memory pool's superblocks:
     const int ok_contains =
@@ -772,29 +760,10 @@ class MemoryPool {
         const int result = CB::release(sb_state_array, bit, block_state);
 
         ok_dealloc_once = 0 <= result;
-
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , sb_id
-        , (1u << block_size_lg2)
-        , (1u << (m_sb_size_lg2 - block_size_lg2))
-        , bit
-        , result );
-#endif
       }
     }
 
     if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
-#if 0
-  printf( "  MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
-        , (uintptr_t)m_sb_state_array
-        , (uintptr_t)p
-        , int(ok_contains)
-        , int(ok_block_aligned)
-        , int(ok_dealloc_once) );
-#endif
       Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
     }
   }
diff --git a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
index f23442b793f5eeca8e0c1b22df6468271df96b73..e3cee93e257b154b73df1d0c40514040d7083f22 100644
--- a/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -46,7 +46,6 @@
 #define KOKKOS_MEMORYTRAITS_HPP
 
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 
@@ -119,6 +118,15 @@ enum : unsigned {
   MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
 };
 
+// ------------------------------------------------------------------ //
+//  This identifies the default memory trait.
+//
+template <typename Tp>
+struct is_default_memory_trait : std::false_type {};
+
+template <>
+struct is_default_memory_trait<Kokkos::MemoryTraits<0>> : std::true_type {};
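+
+// Illustrative checks of the trait (sketch):
+//
+//   static_assert(is_default_memory_trait<Kokkos::MemoryTraits<0>>::value, "");
+//   static_assert(
+//       !is_default_memory_trait<Kokkos::MemoryTraits<Kokkos::Atomic>>::value,
+//       "");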
+
 }  // namespace Impl
 }  // namespace Kokkos
 
diff --git a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
index b9380cbe02b42a04c5b21b6cb8408016049d15f8..1999d46f3c4087dab1192c350da8ba844199a8fe 100644
--- a/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/packages/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -56,11 +56,11 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 // clang-format off
-template <class> struct infinity_helper;
+template <class> struct infinity_helper {};
 template <> struct infinity_helper<float> { static constexpr float value = HUGE_VALF; };
 template <> struct infinity_helper<double> { static constexpr double value = HUGE_VAL; };
 template <> struct infinity_helper<long double> { static constexpr long double value = HUGE_VALL; };
-template <class> struct finite_min_helper;
+template <class> struct finite_min_helper {};
 template <> struct finite_min_helper<bool> { static constexpr bool value = false; };
 template <> struct finite_min_helper<char> { static constexpr char value = CHAR_MIN; };
 template <> struct finite_min_helper<signed char> { static constexpr signed char value = SCHAR_MIN; };
@@ -76,7 +76,7 @@ template <> struct finite_min_helper<unsigned long long int> { static constexpr
 template <> struct finite_min_helper<float> { static constexpr float value = -FLT_MAX; };
 template <> struct finite_min_helper<double> { static constexpr double value = -DBL_MAX; };
 template <> struct finite_min_helper<long double> { static constexpr long double value = -LDBL_MAX; };
-template <class> struct finite_max_helper;
+template <class> struct finite_max_helper {};
 template <> struct finite_max_helper<bool> { static constexpr bool value = true; };
 template <> struct finite_max_helper<char> { static constexpr char value = CHAR_MAX; };
 template <> struct finite_max_helper<signed char> { static constexpr signed char value = SCHAR_MAX; };
@@ -92,7 +92,7 @@ template <> struct finite_max_helper<unsigned long long int> { static constexpr
 template <> struct finite_max_helper<float> { static constexpr float value = FLT_MAX; };
 template <> struct finite_max_helper<double> { static constexpr double value = DBL_MAX; };
 template <> struct finite_max_helper<long double> { static constexpr long double value = LDBL_MAX; };
-template <class> struct epsilon_helper;
+template <class> struct epsilon_helper {};
 namespace{
   // FIXME workaround for LDL_EPSILON with XL
   template<typename T>
@@ -115,15 +115,15 @@ template <> struct epsilon_helper<long double> {
   static constexpr long double value = LDBL_EPSILON;
 #endif
 };
-template <class> struct round_error_helper;
+template <class> struct round_error_helper {};
 template <> struct round_error_helper<float> { static constexpr float value = 0.5F; };
 template <> struct round_error_helper<double> { static constexpr double value = 0.5; };
 template <> struct round_error_helper<long double> { static constexpr long double value = 0.5L; };
-template <class> struct norm_min_helper;
+template <class> struct norm_min_helper {};
 template <> struct norm_min_helper<float> { static constexpr float value = FLT_MIN; };
 template <> struct norm_min_helper<double> { static constexpr double value = DBL_MIN; };
 template <> struct norm_min_helper<long double> { static constexpr long double value = LDBL_MIN; };
-template <class> struct digits_helper;
+template <class> struct digits_helper {};
 template <> struct digits_helper<bool> { static constexpr int value = 1; };
 template <> struct digits_helper<char> { static constexpr int value = CHAR_BIT - std::is_signed<char>::value; };
 template <> struct digits_helper<signed char> { static constexpr int value = CHAR_BIT - 1; };
@@ -139,11 +139,13 @@ template <> struct digits_helper<unsigned long long int> { static constexpr int
 template <> struct digits_helper<float> { static constexpr int value = FLT_MANT_DIG; };
 template <> struct digits_helper<double> { static constexpr int value = DBL_MANT_DIG; };
 template <> struct digits_helper<long double> { static constexpr int value = LDBL_MANT_DIG; };
-template <class> struct digits10_helper;
+template <class> struct digits10_helper {};
 template <> struct digits10_helper<bool> { static constexpr int value = 0; };
-constexpr double log10_2 = 2.41;
+// The fraction 643/2136 approximates log10(2) to 7 significant digits.
+// Workaround for a GCC compiler bug with -frounding-math that prevented the
+// floating-point expression from being evaluated at compile time.
 #define DIGITS10_HELPER_INTEGRAL(TYPE) \
-template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * log10_2; };
+template <> struct digits10_helper<TYPE> { static constexpr int value = digits_helper<TYPE>::value * 643L / 2136; };
 DIGITS10_HELPER_INTEGRAL(char)
 DIGITS10_HELPER_INTEGRAL(signed char)
 DIGITS10_HELPER_INTEGRAL(unsigned char)
@@ -159,15 +161,29 @@ DIGITS10_HELPER_INTEGRAL(unsigned long long int)
 template <> struct digits10_helper<float> { static constexpr int value = FLT_DIG; };
 template <> struct digits10_helper<double> { static constexpr int value = DBL_DIG; };
 template <> struct digits10_helper<long double> { static constexpr int value = LDBL_DIG; };
-template <class> struct max_digits10_helper;
-// FIXME not sure why were not defined in my <cfloat>
-//template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; };
-//template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; };
-//template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; };
-template <> struct max_digits10_helper<float> { static constexpr int value = 9; };
-template <> struct max_digits10_helper<double> { static constexpr int value = 17; };
-template <> struct max_digits10_helper<long double> { static constexpr int value = 21; };
-template <class> struct radix_helper;
+template <class> struct max_digits10_helper {};
+// Approximate ceil(digits<T>::value * log10(2) + 1)
+#define MAX_DIGITS10_HELPER(TYPE) \
+template <> struct max_digits10_helper<TYPE> { static constexpr int value = (digits_helper<TYPE>::value * 643L + 2135) / 2136 + 1; };
+#ifdef FLT_DECIMAL_DIG
+template <> struct max_digits10_helper<float> { static constexpr int value = FLT_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(float)
+#endif
+#ifdef DBL_DECIMAL_DIG
+template <> struct max_digits10_helper<double> { static constexpr int value = DBL_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(double)
+#endif
+#ifdef DECIMAL_DIG
+template <> struct max_digits10_helper<long double> { static constexpr int value = DECIMAL_DIG; };
+#elif defined(LDBL_DECIMAL_DIG)
+template <> struct max_digits10_helper<long double> { static constexpr int value = LDBL_DECIMAL_DIG; };
+#else
+MAX_DIGITS10_HELPER(long double)
+#endif
+#undef MAX_DIGITS10_HELPER
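+// Worked example of the two integer approximations above, assuming a 32-bit
+// int (digits = 31) and an IEEE-754 double (digits = 53):
+//   digits10<int>        : 31 * 643 / 2136              = 9
+//   max_digits10<double> : (53 * 643 + 2135) / 2136 + 1 = 17  (== DBL_DECIMAL_DIG)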
+template <class> struct radix_helper {};
 template <> struct radix_helper<bool> { static constexpr int value = 2; };
 template <> struct radix_helper<char> { static constexpr int value = 2; };
 template <> struct radix_helper<signed char> { static constexpr int value = 2; };
@@ -183,19 +199,19 @@ template <> struct radix_helper<unsigned long long int> { static constexpr int v
 template <> struct radix_helper<float> { static constexpr int value = FLT_RADIX; };
 template <> struct radix_helper<double> { static constexpr int value = FLT_RADIX; };
 template <> struct radix_helper<long double> { static constexpr int value = FLT_RADIX; };
-template <class> struct min_exponent_helper;
+template <class> struct min_exponent_helper {};
 template <> struct min_exponent_helper<float> { static constexpr int value = FLT_MIN_EXP; };
 template <> struct min_exponent_helper<double> { static constexpr int value = DBL_MIN_EXP; };
 template <> struct min_exponent_helper<long double> { static constexpr int value = LDBL_MIN_EXP; };
-template <class> struct min_exponent10_helper;
+template <class> struct min_exponent10_helper {};
 template <> struct min_exponent10_helper<float> { static constexpr int value = FLT_MIN_10_EXP; };
 template <> struct min_exponent10_helper<double> { static constexpr int value = DBL_MIN_10_EXP; };
 template <> struct min_exponent10_helper<long double> { static constexpr int value = LDBL_MIN_10_EXP; };
-template <class> struct max_exponent_helper;
+template <class> struct max_exponent_helper {};
 template <> struct max_exponent_helper<float> { static constexpr int value = FLT_MAX_EXP; };
 template <> struct max_exponent_helper<double> { static constexpr int value = DBL_MAX_EXP; };
 template <> struct max_exponent_helper<long double> { static constexpr int value = LDBL_MAX_EXP; };
-template <class> struct max_exponent10_helper;
+template <class> struct max_exponent10_helper {};
 template <> struct max_exponent10_helper<float> { static constexpr int value = FLT_MAX_10_EXP; };
 template <> struct max_exponent10_helper<double> { static constexpr int value = DBL_MAX_10_EXP; };
 template <> struct max_exponent10_helper<long double> { static constexpr int value = LDBL_MAX_10_EXP; };
diff --git a/packages/kokkos/core/src/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
index eedba38a8456117ac03d8c21e657729673017984..8f12eceb27c46946a83c64eceec0711cca6ef2b7 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -62,7 +62,6 @@
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
@@ -105,9 +104,11 @@ class OpenMP {
   /// \brief Wait until all dispatched functors complete on the given instance
   ///
   ///  This is a no-op on OpenMP
-  static void impl_static_fence(OpenMP const& = OpenMP()) noexcept;
+  static void impl_static_fence(OpenMP const&           = OpenMP(),
+                                const std::string& name = "") noexcept;
 
   void fence() const;
+  void fence(const std::string& name) const;
 
   /// \brief Does the given instance return immediately after launching
   /// a parallel algorithm
@@ -167,7 +168,7 @@ class OpenMP {
   static int impl_get_current_max_threads() noexcept;
 
   static constexpr const char* name() noexcept { return "OpenMP"; }
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 };
 
 namespace Tools {
@@ -188,6 +189,7 @@ class OpenMPSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
index 2a57a43e63b77b7f60e4cc40bb20272e0332944a..f394f3240832a67f22e4056fa27e33500b38178c 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTarget.hpp
@@ -56,9 +56,8 @@
 #include <Kokkos_OpenMPTargetSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_Layout.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <KokkosExp_MDRangePolicy.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
@@ -92,7 +91,10 @@ class OpenMPTarget {
   inline static bool in_parallel() { return omp_in_parallel(); }
 
   static void fence();
+  static void fence(const std::string&);
 
+  static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
 
@@ -115,7 +117,7 @@ class OpenMPTarget {
   }
 
   OpenMPTarget();
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept;
 
  private:
   Impl::OpenMPTargetInternal* m_space_instance;
@@ -141,6 +143,7 @@ class OpenMPTargetSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
index 58d723ac110a2bfd2266d4055f9c222c4a2c2c78..c1d338331f56cd59a9eb917a2d8f72ebb06b453b 100644
--- a/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
@@ -89,6 +89,41 @@ namespace Impl {
 }  // namespace Impl
 }  // namespace Kokkos
 
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::OpenMPTargetSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+//----------------------------------------
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                         Kokkos::Experimental::OpenMPTargetSpace> {
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = false };
+};
+}  // namespace Impl
+}  // namespace Kokkos
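+
+// Consequence of the traits above (sketch; the view name and size are
+// placeholders): host code cannot dereference OpenMPTargetSpace allocations
+// directly (accessible == false), but mirroring plus deep_copy is supported
+// (deepcopy == true):
+//
+//   Kokkos::View<double*, Kokkos::Experimental::OpenMPTargetSpace> d("d", n);
+//   auto h = Kokkos::create_mirror_view(d);  // HostSpace mirror
+//   Kokkos::deep_copy(h, d);                 // routed through the DeepCopy
+//                                            // specializations below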
+
 namespace Kokkos {
 namespace Experimental {
 
@@ -213,7 +248,10 @@ struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,
                                        omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, OpenMPTargetSpace>: fence "
+        "before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_default_device(),
@@ -231,7 +269,9 @@ struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace, HostSpace,
                                        omp_get_initial_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<OpenMPTargetSpace, HostSpace>: fence before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_default_device(),
@@ -249,7 +289,9 @@ struct DeepCopy<HostSpace, Kokkos::Experimental::OpenMPTargetSpace,
                                        omp_get_default_device()));
   }
   DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
+    exec.fence(
+        "Kokkos::Impl::DeepCopy<HostSpace, OpenMPTargetSpace>: fence before "
+        "copy");
     if (n > 0)
       OMPT_SAFE_CALL(omp_target_memcpy(dst, const_cast<void*>(src), n, 0, 0,
                                        omp_get_initial_device(),
diff --git a/packages/kokkos/core/src/Kokkos_Parallel.hpp b/packages/kokkos/core/src/Kokkos_Parallel.hpp
index 85d1dad454ba64aa1311cf19437206768018571b..25ebe26155fed5812e161e14360811cfc660e105 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel.hpp
@@ -48,23 +48,19 @@
 #ifndef KOKKOS_PARALLEL_HPP
 #define KOKKOS_PARALLEL_HPP
 
-#include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_View.hpp>
+#include <Kokkos_DetectionIdiom.hpp>
 #include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_View.hpp>
 
 #include <impl/Kokkos_Tools.hpp>
-#include <type_traits>
-#include <typeinfo>
-
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#ifdef KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-#include <iostream>
-#endif
+#include <cstddef>
+#include <type_traits>
+#include <typeinfo>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -72,34 +68,11 @@
 namespace Kokkos {
 namespace Impl {
 
-template <class T, class = void>
-struct is_detected_execution_space : std::false_type {
-  using type = not_a_type;
-};
-
 template <class T>
-struct is_detected_execution_space<T, void_t<typename T::execution_space>>
-    : std::true_type {
-  using type = typename T::execution_space;
-};
+using execution_space_t = typename T::execution_space;
 
 template <class T>
-using detected_execution_space_t =
-    typename is_detected_execution_space<T>::type;
-
-template <class T, class = void>
-struct is_detected_device_type : std::false_type {
-  using type = not_a_type;
-};
-
-template <class T>
-struct is_detected_device_type<T, void_t<typename T::device_type>>
-    : std::true_type {
-  using type = typename T::device_type;
-};
-
-template <class T>
-using detected_device_type_t = typename is_detected_device_type<T>::type;
+using device_type_t = typename T::device_type;
 
 //----------------------------------------------------------------------------
 /** \brief  Given a Functor and Execution Policy query an execution space.
@@ -112,16 +85,14 @@ using detected_device_type_t = typename is_detected_device_type<T>::type;
 
 template <class Functor, class Policy>
 struct FunctorPolicyExecutionSpace {
-  using execution_space = std::conditional_t<
-      is_detected_execution_space<Policy>::value,
-      detected_execution_space_t<Policy>,
-      std::conditional_t<
-          is_detected_execution_space<Functor>::value,
-          detected_execution_space_t<Functor>,
+  using execution_space = detected_or_t<
+      detected_or_t<
           std::conditional_t<
-              is_detected_device_type<Functor>::value,
-              detected_execution_space_t<detected_device_type_t<Functor>>,
-              Kokkos::DefaultExecutionSpace>>>;
+              is_detected<device_type_t, Functor>::value,
+              detected_t<execution_space_t, detected_t<device_type_t, Functor>>,
+              Kokkos::DefaultExecutionSpace>,
+          execution_space_t, Functor>,
+      execution_space_t, Policy>;
 };
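+
+// How the detected_or_t composition above resolves, shown on two hypothetical
+// types (detected_or_t<Fallback, Op, T> is Op<T> when that alias is
+// well-formed, Fallback otherwise):
+//
+//   struct Plain {};  // no nested execution_space/device_type
+//   struct WithSpace { using execution_space = Kokkos::DefaultExecutionSpace; };
+//
+//   detected_or_t<void, execution_space_t, Plain>     -> void
+//   detected_or_t<void, execution_space_t, WithSpace> -> Kokkos::DefaultExecutionSpace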
 
 }  // namespace Impl
@@ -158,8 +129,7 @@ inline void parallel_for(
     const ExecPolicy& policy, const FunctorType& functor,
     const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecPolicy>::value>::type* = nullptr) {
   uint64_t kpID = 0;
 
   ExecPolicy inner_policy = policy;
@@ -200,18 +170,7 @@ inline void parallel_for(const size_t work_count, const FunctorType& functor,
 template <class ExecPolicy, class FunctorType>
 inline void parallel_for(const std::string& str, const ExecPolicy& policy,
                          const FunctorType& functor) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_for(policy, functor, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_for kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 }  // namespace Kokkos
@@ -255,9 +214,12 @@ namespace Kokkos {
 ///   // operator() or join().
 ///   using value_type = PodType;
 ///
-///   void operator () (const ExecPolicy::member_type & i, value_type& update,
-///   const bool final_pass) const; void init (value_type& update) const; void
-///   join (volatile value_type& update, volatile const value_type& input) const
+///   void operator () (const ExecPolicy::member_type & i,
+///                     value_type& update,
+///                     const bool final_pass) const;
+///   void init (value_type& update) const;
+///   void join (volatile value_type& update,
+///              volatile const value_type& input) const;
 /// };
 /// \endcode
 ///
@@ -389,8 +351,7 @@ inline void parallel_scan(
     const ExecutionPolicy& policy, const FunctorType& functor,
     const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -430,18 +391,7 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor,
 template <class ExecutionPolicy, class FunctorType>
 inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
                           const FunctorType& functor) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_scan(policy, functor, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 template <class ExecutionPolicy, class FunctorType, class ReturnType>
@@ -449,8 +399,7 @@ inline void parallel_scan(
     const ExecutionPolicy& policy, const FunctorType& functor,
     ReturnType& return_value, const std::string& str = "",
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<ExecutionPolicy>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<ExecutionPolicy>::value>::type* = nullptr) {
   uint64_t kpID                = 0;
   ExecutionPolicy inner_policy = policy;
   Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID);
@@ -464,7 +413,8 @@ inline void parallel_scan(
 
   Kokkos::Tools::Impl::end_parallel_scan(inner_policy, functor, str, kpID);
 
-  policy.space().fence();
+  policy.space().fence(
+      "Kokkos::parallel_scan: fence due to result being a value, not a view");
 }
 
 template <class FunctorType, class ReturnType>
@@ -491,25 +441,15 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor,
 
   Kokkos::Tools::Impl::end_parallel_scan(execution_policy, functor, str, kpID);
 
-  execution_space().fence();
+  execution_space().fence(
+      "Kokkos::parallel_scan: fence after scan with return value");
 }
 
 template <class ExecutionPolicy, class FunctorType, class ReturnType>
 inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy,
                           const FunctorType& functor,
                           ReturnType& return_value) {
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
-#endif
-
   ::Kokkos::parallel_scan(policy, functor, return_value, str);
-
-#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
-#endif
-  (void)str;
 }
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index 96242f99b0ca678e1ede6f148ae5d90a16127afe..bc613cea62b10a56f888baabbc16ed9258e041dd 100644
--- a/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -811,7 +811,7 @@ struct ParallelReducePolicyType;
 template <class PolicyType, class FunctorType>
 struct ParallelReducePolicyType<
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type,
+        Kokkos::is_execution_policy<PolicyType>::value>::type,
     PolicyType, FunctorType> {
   using policy_type = PolicyType;
   static PolicyType policy(const PolicyType& policy_) { return policy_; }
@@ -948,9 +948,10 @@ parallel_reduce_needs_fence(ExecutionSpace const&, ViewLike const&) {
 template <class ExecutionSpace, class... Args>
 struct ParallelReduceFence {
   template <class... ArgsDeduced>
-  static void fence(const ExecutionSpace& ex, ArgsDeduced&&... args) {
+  static void fence(const ExecutionSpace& ex, const std::string& name,
+                    ArgsDeduced&&... args) {
     if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) {
-      ex.fence();
+      ex.fence(name);
     }
   }
 };
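+
+// Sketch of what the fence policy above means at a call site ("functor", "n",
+// and the view name are placeholders): reducing into a plain scalar blocks so
+// the value is ready on return, while reducing into a View may remain
+// asynchronous and needs a fence before the host reads it.
+//
+//   double result;
+//   Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, n), functor, result);
+//   // result is ready here; the fence above has already been issued.
+//
+//   Kokkos::View<double, Kokkos::HostSpace> result_v("result_v");
+//   Kokkos::parallel_reduce(Kokkos::RangePolicy<>(0, n), functor, result_v);
+//   Kokkos::fence();  // needed before reading result_v()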
@@ -974,7 +975,6 @@ struct ParallelReduceFence {
  *    void join( volatile       <podType> & update ,
  *               volatile const <podType> & input ) const ;
  *
- *    using has_final = true_type;
  *    void final( <podType> & update ) const ;
  *  };
  * \endcode
@@ -991,7 +991,6 @@ struct ParallelReduceFence {
  *    void join( volatile       <podType> update[] ,
  *               volatile const <podType> input[] ) const ;
  *
- *    using has_final = true_type;
  *    void final( <podType> update[] ) const ;
  *  };
  * \endcode
@@ -1001,24 +1000,30 @@ struct ParallelReduceFence {
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const std::string& label, const PolicyType& policy,
                 const FunctorType& functor, ReturnType& return_value) {
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       label, policy, functor, return_value);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const PolicyType& policy, const FunctorType& functor,
                 ReturnType& return_value) {
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       "", policy, functor, return_value);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1030,7 +1035,10 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       "", policy_type(0, policy), functor, return_value);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1043,33 +1051,42 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       label, policy_type(0, policy), functor, return_value);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 // ReturnValue as View or Reducer: take by copy to allow for inline construction
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const std::string& label, const PolicyType& policy,
                 const FunctorType& functor, const ReturnType& return_value) {
   ReturnType return_value_impl = return_value;
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       label, policy, functor, return_value_impl);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class PolicyType, class FunctorType, class ReturnType>
 inline typename std::enable_if<
-    Kokkos::Impl::is_execution_policy<PolicyType>::value>::type
+    Kokkos::is_execution_policy<PolicyType>::value>::type
 parallel_reduce(const PolicyType& policy, const FunctorType& functor,
                 const ReturnType& return_value) {
   ReturnType return_value_impl = return_value;
   Impl::ParallelReduceAdaptor<PolicyType, FunctorType, ReturnType>::execute(
       "", policy, functor, return_value_impl);
-  Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            ReturnType>::fence(policy.space(), return_value);
+  Impl::ParallelReduceFence<typename PolicyType::execution_space, ReturnType>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1082,7 +1099,10 @@ inline void parallel_reduce(const size_t& policy, const FunctorType& functor,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       "", policy_type(0, policy), functor, return_value_impl);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 template <class FunctorType, class ReturnType>
@@ -1096,7 +1116,10 @@ inline void parallel_reduce(const std::string& label, const size_t& policy,
   Impl::ParallelReduceAdaptor<policy_type, FunctorType, ReturnType>::execute(
       label, policy_type(0, policy), functor, return_value_impl);
   Impl::ParallelReduceFence<typename policy_type::execution_space, ReturnType>::
-      fence(typename policy_type::execution_space(), return_value);
+      fence(
+          typename policy_type::execution_space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          return_value);
 }
 
 // No Return Argument
@@ -1106,8 +1129,7 @@ inline void parallel_reduce(
     const std::string& label, const PolicyType& policy,
     const FunctorType& functor,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
   using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
                                         typename ValueTraits::value_type,
@@ -1131,8 +1153,7 @@ template <class PolicyType, class FunctorType>
 inline void parallel_reduce(
     const PolicyType& policy, const FunctorType& functor,
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type* =
-        nullptr) {
+        Kokkos::is_execution_policy<PolicyType>::value>::type* = nullptr) {
   using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
   using value_type  = std::conditional_t<(ValueTraits::StaticValueSize != 0),
                                         typename ValueTraits::value_type,
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/src/Kokkos_Rank.hpp
similarity index 71%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
rename to packages/kokkos/core/src/Kokkos_Rank.hpp
index 5eed2ca0d77b828b2431bfce0fe69c4da457bb95..3603e2860891758e489471683d34438f219f84d5 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_b.cpp
+++ b/packages/kokkos/core/src/Kokkos_Rank.hpp
@@ -42,5 +42,30 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_b.hpp>
+#ifndef KOKKOS_KOKKOS_RANK_HPP
+#define KOKKOS_KOKKOS_RANK_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Layout.hpp>  // Iterate
+
+namespace Kokkos {
+
+// Iteration Pattern
+template <unsigned N, Iterate OuterDir = Iterate::Default,
+          Iterate InnerDir = Iterate::Default>
+struct Rank {
+  static_assert(N != 0u, "Kokkos Error: rank 0 undefined");
+  static_assert(N != 1u,
+                "Kokkos Error: rank 1 is not a multi-dimensional range");
+  static_assert(N < 7u, "Kokkos Error: Unsupported rank...");
+
+  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
+
+  static constexpr int rank                = N;
+  static constexpr Iterate outer_direction = OuterDir;
+  static constexpr Iterate inner_direction = InnerDir;
+};
+
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_RANK_HPP
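Rank only encodes the iteration pattern; it is consumed through MDRangePolicy. A minimal usage sketch, assuming a default-enabled host backend (the view shape, label, and kernel body are illustrative):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double**> a("a", 8, 8);
    // Rank<2, OuterDir, InnerDir>: a two-dimensional range; the Iterate
    // parameters select the outer/inner traversal order (Default lets the
    // backend pick a layout-appropriate order).
    using policy_t = Kokkos::MDRangePolicy<
        Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Right>>;
    Kokkos::parallel_for(
        "fill", policy_t({0, 0}, {8, 8}),
        KOKKOS_LAMBDA(const int i, const int j) { a(i, j) = i * 8 + j; });
  }
  Kokkos::finalize();
  return 0;
}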
diff --git a/packages/kokkos/core/src/Kokkos_SYCL.hpp b/packages/kokkos/core/src/Kokkos_SYCL.hpp
index 8ee76b43862fd6c54c42d98e081174f11d5e09e4..02095ff7b3f01f7558d8e154d1ebc49ff46d1241 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL.hpp
@@ -83,7 +83,9 @@ class SYCL {
   SYCL();
   explicit SYCL(const sycl::queue&);
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept {
+    return m_space_instance->impl_get_instance_id();
+  }
 
   sycl::context sycl_context() const noexcept {
     return m_space_instance->m_queue->get_context();
@@ -110,7 +112,9 @@ class SYCL {
 
   /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
   static void impl_static_fence();
+  static void impl_static_fence(const std::string&);
   void fence() const;
+  void fence(const std::string&) const;
 
   /// \brief Print configuration information to the given output stream.
   void print_configuration(std::ostream&, const bool detail = false);
@@ -165,6 +169,7 @@ class SYCLSpaceInitializer : public Kokkos::Impl::ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
@@ -181,6 +186,41 @@ struct DeviceTypeTraits<Kokkos::Experimental::SYCL> {
 }  // namespace Experimental
 }  // namespace Tools
 
+namespace Experimental {
+template <class... Args>
+std::vector<SYCL> partition_space(const SYCL& sycl_space, Args...) {
+#ifdef __cpp_fold_expressions
+  static_assert(
+      (... && std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+
+  sycl::context context = sycl_space.sycl_context();
+  sycl::default_selector device_selector;
+  std::vector<SYCL> instances;
+  instances.reserve(sizeof...(Args));
+  for (unsigned int i = 0; i < sizeof...(Args); ++i)
+    instances.emplace_back(sycl::queue(context, device_selector));
+  return instances;
+}
+
+template <class T>
+std::vector<SYCL> partition_space(const SYCL& sycl_space,
+                                  std::vector<T>& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  sycl::context context = sycl_space.sycl_context();
+  sycl::default_selector device_selector;
+  std::vector<SYCL> instances;
+  instances.reserve(weights.size());
+  for (unsigned int i = 0; i < weights.size(); ++i)
+    instances.emplace_back(sycl::queue(context, device_selector));
+  return instances;
+}
+}  // namespace Experimental
+
 }  // namespace Kokkos
 
 #endif
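A hedged sketch of the new partition_space overloads: as implemented above, the trailing arguments (or the weight vector) only determine how many instances are created, and each instance is backed by its own queue in the shared context. The function name run_two_streams, the size n, and the kernel bodies are illustrative:

#include <Kokkos_Core.hpp>

void run_two_streams(const int n) {
  auto instances = Kokkos::Experimental::partition_space(
      Kokkos::Experimental::SYCL(), 1, 1);  // two instances, weights unused

  Kokkos::parallel_for(
      Kokkos::RangePolicy<Kokkos::Experimental::SYCL>(instances[0], 0, n),
      KOKKOS_LAMBDA(const int /*i*/) { /* independent work, stream 0 */ });
  Kokkos::parallel_for(
      Kokkos::RangePolicy<Kokkos::Experimental::SYCL>(instances[1], 0, n),
      KOKKOS_LAMBDA(const int /*i*/) { /* independent work, stream 1 */ });

  instances[0].fence("run_two_streams: wait on stream 0");
  instances[1].fence("run_two_streams: wait on stream 1");
}

The labeled fence used here is the overload added to the SYCL class in this same hunk.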
diff --git a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
index 392ab0e59a7d01f42342318bb44aa172bcb4f705..15ef11024d53501356d8004c58fc94fbeca80227 100644
--- a/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
+++ b/packages/kokkos/core/src/Kokkos_SYCL_Space.hpp
@@ -49,12 +49,19 @@
 
 #ifdef KOKKOS_ENABLE_SYCL
 #include <Kokkos_Concepts.hpp>
+#include <Kokkos_HostSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <SYCL/Kokkos_SYCL_Instance.hpp>
 #include <impl/Kokkos_SharedAlloc.hpp>
 #include <impl/Kokkos_Tools.hpp>
 
 namespace Kokkos {
+
+namespace Impl {
+template <typename T>
+struct is_sycl_type_space : public std::false_type {};
+}  // namespace Impl
+
 namespace Experimental {
 
 class SYCLDeviceUSMSpace {
@@ -118,9 +125,54 @@ class SYCLSharedUSMSpace {
  private:
   sycl::queue m_queue;
 };
+
+class SYCLHostUSMSpace {
+ public:
+  using execution_space = HostSpace::execution_space;
+  using memory_space    = SYCLHostUSMSpace;
+  using device_type     = Kokkos::Device<execution_space, memory_space>;
+  using size_type       = Impl::SYCLInternal::size_type;
+
+  SYCLHostUSMSpace();
+  explicit SYCLHostUSMSpace(sycl::queue queue);
+
+  void* allocate(const std::size_t arg_alloc_size) const;
+  void* allocate(const char* arg_label, const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const;
+
+  void deallocate(void* const arg_alloc_ptr,
+                  const std::size_t arg_alloc_size) const;
+  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
+                  const size_t arg_alloc_size,
+                  const size_t arg_logical_size = 0) const;
+
+ private:
+  template <class, class, class, class>
+  friend class LogicalMemorySpace;
+
+ public:
+  static constexpr const char* name() { return "SYCLHostUSM"; };
+
+ private:
+  sycl::queue m_queue;
+};
+
 }  // namespace Experimental
 
 namespace Impl {
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLDeviceUSMSpace>
+    : public std::true_type {};
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLSharedUSMSpace>
+    : public std::true_type {};
+
+template <>
+struct is_sycl_type_space<Kokkos::Experimental::SYCLHostUSMSpace>
+    : public std::true_type {};
+
 static_assert(Kokkos::Impl::MemorySpaceAccess<
                   Kokkos::Experimental::SYCLDeviceUSMSpace,
                   Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
@@ -131,6 +183,11 @@ static_assert(Kokkos::Impl::MemorySpaceAccess<
                   Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
               "");
 
+static_assert(Kokkos::Impl::MemorySpaceAccess<
+                  Kokkos::Experimental::SYCLHostUSMSpace,
+                  Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+              "");
+
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace,
                          Kokkos::Experimental::SYCLDeviceUSMSpace> {
@@ -148,6 +205,16 @@ struct MemorySpaceAccess<Kokkos::HostSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::HostSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // HostSpace::execution_space ==
+  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { assignable = true };
+  enum : bool { accessible = true };
+  enum : bool { deepcopy = true };
+};
+
 template <>
 struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
                          Kokkos::HostSpace> {
@@ -165,6 +232,18 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
   enum : bool { deepcopy = true };
 };
 
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // Experimental::SYCLDeviceUSMSpace::execution_space !=
+  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { assignable = false };
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLDeviceUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
 //----------------------------------------
 // SYCLSharedUSMSpace::execution_space == SYCL
 // SYCLSharedUSMSpace accessible to both SYCL and Host
@@ -191,17 +270,46 @@ struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
 };
 
 template <>
-struct MemorySpaceAccess<
-    Kokkos::Experimental::SYCLDeviceUSMSpace,
-    Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::Experimental::SYCLHostUSMSpace> {
+  // Experimental::SYCLSharedUSMSpace::execution_space !=
+  // Experimental::SYCLHostUSMSpace::execution_space
   enum : bool { assignable = false };
-  enum : bool { accessible = true };
-  enum : bool { deepcopy = false };
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLSharedUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::HostSpace> {
+  enum : bool { assignable = false };  // Cannot access from SYCL
+  enum : bool {
+    accessible = true
+  };  // Experimental::SYCLHostUSMSpace::execution_space
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCLDeviceUSMSpace> {
+  enum : bool { assignable = false };  // Cannot access from Host
+  enum : bool { accessible = false };
+  enum : bool { deepcopy = true };
+};
+
+template <>
+struct MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCLSharedUSMSpace> {
+  enum : bool { assignable = false };  // different execution_space
+  enum : bool { accessible = true };   // same accessibility
+  enum : bool { deepcopy = true };
 };
 
 template <>
 struct MemorySpaceAccess<
-    Kokkos::Experimental::SYCLSharedUSMSpace,
+    Kokkos::Experimental::SYCLDeviceUSMSpace,
     Kokkos::ScratchMemorySpace<Kokkos::Experimental::SYCL>> {
   enum : bool { assignable = false };
   enum : bool { accessible = true };
@@ -276,6 +384,37 @@ class SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>
       const RecordBase::function_type arg_dealloc = &base_t::deallocate);
 };
 
+template <>
+class SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>
+    : public SharedAllocationRecordCommon<
+          Kokkos::Experimental::SYCLHostUSMSpace> {
+ private:
+  friend class SharedAllocationRecordCommon<
+      Kokkos::Experimental::SYCLHostUSMSpace>;
+  using base_t =
+      SharedAllocationRecordCommon<Kokkos::Experimental::SYCLHostUSMSpace>;
+  using RecordBase = SharedAllocationRecord<void, void>;
+
+  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord(SharedAllocationRecord&&)      = delete;
+  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
+  SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete;
+
+  static RecordBase s_root_record;
+
+  const Kokkos::Experimental::SYCLHostUSMSpace m_space;
+
+ protected:
+  ~SharedAllocationRecord();
+
+  SharedAllocationRecord() = default;
+
+  SharedAllocationRecord(
+      const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+      const std::string& arg_label, const size_t arg_alloc_size,
+      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
+};
+
 }  // namespace Impl
 
 }  // namespace Kokkos
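To make the new access matrix concrete: SYCLHostUSMSpace allocations are writable directly from host code (accessible from HostSpace) and can be deep-copied to and from device memory, while SYCLDeviceUSMSpace allocations are reachable from the host only via deep_copy. A short sketch, assuming views may be allocated directly in these spaces (the function name and sizes are illustrative):

#include <Kokkos_Core.hpp>

void stage_to_device(const int n) {
  // Host-pinned USM buffer, filled by plain host code.
  Kokkos::View<double*, Kokkos::Experimental::SYCLHostUSMSpace> staging(
      "staging", n);
  for (int i = 0; i < n; ++i) staging(i) = static_cast<double>(i);

  // Device USM buffer; deepcopy == true in the traits above permits this copy.
  Kokkos::View<double*, Kokkos::Experimental::SYCLDeviceUSMSpace> device(
      "device", n);
  Kokkos::deep_copy(device, staging);
}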
diff --git a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
index 2eebf5365e71d2c5cf42c356951ccec9d041fe14..bb740cfb86a966aefd4ac1ab6c9233ab81e0a97d 100644
--- a/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/packages/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -148,10 +148,10 @@ class ScratchMemorySpace {
                                             const IntType& size_L0,
                                             void* ptr_L1           = nullptr,
                                             const IntType& size_L1 = 0)
-      : m_iter_L0((char*)ptr_L0),
-        m_iter_L1((char*)ptr_L1),
-        m_end_L0((char*)ptr_L0 + size_L0),
-        m_end_L1((char*)ptr_L1 + size_L1),
+      : m_iter_L0(static_cast<char*>(ptr_L0)),
+        m_iter_L1(static_cast<char*>(ptr_L1)),
+        m_end_L0(static_cast<char*>(ptr_L0) + size_L0),
+        m_end_L1(static_cast<char*>(ptr_L1) + size_L1),
         m_multiplier(1),
         m_offset(0),
         m_default_level(0) {}
diff --git a/packages/kokkos/core/src/Kokkos_Serial.hpp b/packages/kokkos/core/src/Kokkos_Serial.hpp
index 4d5bb2410bfaabf6f752acf55795c9d7ef82016d..9c8ae70721e58bff5a91da3b99e3630ffe0c273a 100644
--- a/packages/kokkos/core/src/Kokkos_Serial.hpp
+++ b/packages/kokkos/core/src/Kokkos_Serial.hpp
@@ -53,6 +53,8 @@
 
 #include <cstddef>
 #include <iosfwd>
+#include <mutex>
+#include <thread>
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_Parallel.hpp>
 #include <Kokkos_TaskScheduler.hpp>
@@ -60,12 +62,12 @@
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_HostThreadTeam.hpp>
 #include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Tools.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
+#include <impl/Kokkos_HostSharedPtr.hpp>
 
 #include <KokkosExp_MDRangePolicy.hpp>
 
@@ -73,6 +75,32 @@
 
 namespace Kokkos {
 
+namespace Impl {
+class SerialInternal {
+ public:
+  SerialInternal() = default;
+
+  bool is_initialized();
+
+  void initialize();
+
+  void finalize();
+
+  static SerialInternal& singleton();
+
+  std::mutex m_thread_team_data_mutex;
+
+  // Resize thread team data scratch memory
+  void resize_thread_team_data(size_t pool_reduce_bytes,
+                               size_t team_reduce_bytes,
+                               size_t team_shared_bytes,
+                               size_t thread_local_bytes);
+
+  HostThreadTeamData m_thread_team_data;
+  bool m_is_initialized = false;
+};
+}  // namespace Impl
+
 /// \class Serial
 /// \brief Kokkos device for non-parallel execution
 ///
@@ -107,6 +135,8 @@ class Serial {
 
   //@}
 
+  Serial();
+
   /// \brief True if and only if this method is being called in a
   ///   thread-parallel function.
   ///
@@ -121,9 +151,26 @@ class Serial {
   /// return asynchronously, before the functor completes.  This
   /// method does not return until all dispatched functors on this
   /// device have completed.
-  static void impl_static_fence() {}
+  static void impl_static_fence() {
+    impl_static_fence(
+        "Kokkos::Serial::impl_static_fence: Unnamed Static Fence");
+  }
+  static void impl_static_fence(const std::string& name) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        []() {});  // TODO: correct device ID
+    Kokkos::memory_fence();
+  }
 
-  void fence() const {}
+  void fence() const { fence("Kokkos::Serial::fence: Unnamed Instance Fence"); }
+  void fence(const std::string& name) const {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Serial>(
+        name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1},
+        []() {});  // TODO: correct device ID
+    Kokkos::memory_fence();
+  }
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency() { return 1; }
@@ -153,9 +200,24 @@ class Serial {
     return impl_thread_pool_size(0);
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 
   static const char* name();
+
+  Impl::SerialInternal* impl_internal_space_instance() const {
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    return m_space_instance;
+#else
+    return m_space_instance.get();
+#endif
+  }
+
+ private:
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+  Impl::SerialInternal* m_space_instance;
+#else
+  Kokkos::Impl::HostSharedPtr<Impl::SerialInternal> m_space_instance;
+#endif
   //--------------------------------------------------------------------------
 };
 
@@ -177,6 +239,7 @@ class SerialSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
@@ -206,20 +269,6 @@ struct MemorySpaceAccess<Kokkos::Serial::memory_space,
 namespace Kokkos {
 namespace Impl {
 
-// Resize thread team data scratch memory
-void serial_resize_thread_team_data(size_t pool_reduce_bytes,
-                                    size_t team_reduce_bytes,
-                                    size_t team_shared_bytes,
-                                    size_t thread_local_bytes);
-
-HostThreadTeamData* serial_get_thread_team_data();
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-namespace Kokkos {
-namespace Impl {
-
 /*
  * < Kokkos::Serial , WorkArgTag >
  * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial ,
@@ -510,13 +559,19 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -606,13 +661,18 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    HostThreadTeamData& data = *serial_get_thread_team_data();
-
-    reference_type update =
-        ValueInit::init(m_functor, pointer_type(data.pool_reduce_local()));
+    reference_type update = ValueInit::init(
+        m_functor,
+        pointer_type(
+            internal_instance->m_thread_team_data.pool_reduce_local()));
 
     this->template exec<WorkTag>(update);
   }
@@ -667,13 +727,18 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
+    // Need to lock resize_thread_team_data
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    HostThreadTeamData& data = *serial_get_thread_team_data();
-
-    reference_type update =
-        ValueInit::init(m_functor, pointer_type(data.pool_reduce_local()));
+    reference_type update = ValueInit::init(
+        m_functor,
+        pointer_type(
+            internal_instance->m_thread_team_data.pool_reduce_local()));
 
     this->template exec<WorkTag>(update);
 
@@ -797,13 +862,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -869,6 +940,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using Member = typename Policy::member_type;
 
   const FunctorType m_functor;
+  const Policy m_policy;
   const int m_league;
   const int m_shared;
 
@@ -896,16 +968,21 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const size_t team_shared_size  = m_shared;
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
-    this->template exec<typename Policy::work_tag>(data);
+    this->template exec<typename Policy::work_tag>(
+        internal_instance->m_thread_team_data);
   }
 
   ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
                  FunctorTeamShmemSize<FunctorType>::value(arg_functor, 1)) {}
@@ -941,6 +1018,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using reference_type = typename Analysis::reference_type;
 
   const FunctorType m_functor;
+  const Policy m_policy;
   const int m_league;
   const ReducerType m_reducer;
   pointer_type m_result_ptr;
@@ -973,18 +1051,24 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const size_t team_shared_size  = m_shared;
     const size_t thread_local_size = 0;  // Never shrinks
 
-    serial_resize_thread_team_data(pool_reduce_size, team_reduce_size,
-                                   team_shared_size, thread_local_size);
-
-    HostThreadTeamData& data = *serial_get_thread_team_data();
+    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    // Need to lock resize_thread_team_data
+    std::lock_guard<std::mutex> lock(
+        internal_instance->m_thread_team_data_mutex);
+    internal_instance->resize_thread_team_data(
+        pool_reduce_size, team_reduce_size, team_shared_size,
+        thread_local_size);
 
     pointer_type ptr =
-        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+        m_result_ptr
+            ? m_result_ptr
+            : pointer_type(
+                  internal_instance->m_thread_team_data.pool_reduce_local());
 
     reference_type update =
         ValueInit::init(ReducerConditional::select(m_functor, m_reducer), ptr);
 
-    this->template exec<WorkTag>(data, update);
+    this->template exec<WorkTag>(internal_instance->m_thread_team_data, update);
 
     Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
         ReducerConditional::select(m_functor, m_reducer), ptr);
@@ -998,6 +1082,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = nullptr)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
@@ -1016,6 +1101,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
       : m_functor(arg_functor),
+        m_policy(arg_policy),
         m_league(arg_policy.league_size()),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
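The Serial changes above (mirrored by the Threads, OpenMP, and SYCL overloads elsewhere in this diff) add fence variants that forward a label to the tools interface. A minimal sketch of calling the labeled instance fence from user code, assuming the Serial backend is enabled; the label text is arbitrary:

#include <Kokkos_Core.hpp>

void wait_for_serial_work() {
  // Tools hooking the fence callbacks report this string instead of the
  // default "Kokkos::Serial::fence: Unnamed Instance Fence".
  Kokkos::Serial().fence("wait_for_serial_work: drain outstanding work");
}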
diff --git a/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
index 91e079a0e78e314cdb4b22a42876564f25143a4c..9751fab460d4495b1683a94fc0e9d7f879c9a412 100644
--- a/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskPolicy.hpp
@@ -43,5 +43,9 @@
 */
 
 // For backward compatibility:
+#include <Kokkos_Macros.hpp>
+
+KOKKOS_IMPL_WARNING(
+    "This file is deprecated. Use <Kokkos_TaskScheduler.hpp> instead.")
 
 #include <Kokkos_TaskScheduler.hpp>
diff --git a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
index 743273670c9b5fa77f6d590596eb27fc7204396a..17e78f5e81fe83c940eea1bcd6c1d6c347649a4b 100644
--- a/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/packages/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
@@ -372,7 +371,10 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
         task_base* const t = arg[i].m_task;
         if (nullptr != t) {
           // Increment reference count to track subsequent assignment.
-          Kokkos::atomic_increment(&(t->m_ref_count));
+          // This likely has to be SeqCst
+          Kokkos::Impl::desul_atomic_inc(&(t->m_ref_count),
+                                         Kokkos::Impl::MemoryOrderSeqCst(),
+                                         Kokkos::Impl::MemoryScopeDevice());
           if (q != static_cast<queue_type const*>(t->m_queue)) {
             Kokkos::abort(
                 "Kokkos when_all Futures must be in the same scheduler");
@@ -467,7 +469,10 @@ class BasicTaskScheduler : public Impl::TaskSchedulerBase {
           //  scheduler" );
           //}
           // Increment reference count to track subsequent assignment.
-          Kokkos::atomic_increment(&(arg_f.m_task->m_ref_count));
+          // This increment likely has to be SeqCst
+          Kokkos::Impl::desul_atomic_inc(&(arg_f.m_task->m_ref_count),
+                                         Kokkos::Impl::MemoryOrderSeqCst(),
+                                         Kokkos::Impl::MemoryScopeDevice());
           dep[i] = arg_f.m_task;
         }
       }
diff --git a/packages/kokkos/core/src/Kokkos_Threads.hpp b/packages/kokkos/core/src/Kokkos_Threads.hpp
index e827c2a2a1abd46999360c1eef57eb85428436aa..da9bea9c2347faca7b8e5944cb08a0783983f285 100644
--- a/packages/kokkos/core/src/Kokkos_Threads.hpp
+++ b/packages/kokkos/core/src/Kokkos_Threads.hpp
@@ -57,7 +57,6 @@
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
-#include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_ExecSpaceInitializer.hpp>
 
 /*--------------------------------------------------------------------------*/
@@ -65,6 +64,7 @@
 namespace Kokkos {
 namespace Impl {
 class ThreadsExec;
+enum class fence_is_static { yes, no };
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -108,8 +108,10 @@ class Threads {
   /// method does not return until all dispatched functors on this
   /// device have completed.
   static void impl_static_fence();
+  static void impl_static_fence(const std::string& name);
 
   void fence() const;
+  void fence(const std::string&) const;
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency();
@@ -167,7 +169,7 @@ class Threads {
     return impl_thread_pool_rank();
   }
 
-  uint32_t impl_instance_id() const noexcept { return 0; }
+  uint32_t impl_instance_id() const noexcept { return 1; }
 
   static const char* name();
   //@}
@@ -192,6 +194,7 @@ class ThreadsSpaceInitializer : public ExecSpaceInitializerBase {
   void initialize(const InitArguments& args) final;
   void finalize(const bool) final;
   void fence() final;
+  void fence(const std::string&) final;
   void print_configuration(std::ostream& msg, const bool detail) final;
 };
 
diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp
index f7cc34cc114d29cbe5612bf4350fe01a498282c3..52edd82052f4cfb3919b4733e4acb167780eaf8e 100644
--- a/packages/kokkos/core/src/Kokkos_Tuners.hpp
+++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp
@@ -306,7 +306,11 @@ class MultidimensionalSparseTuningProblem {
   static constexpr size_t max_space_dimension_size = MaxDimensionSize;
   static constexpr double tuning_min               = 0.0;
   static constexpr double tuning_max               = 0.999;
-  static constexpr double tuning_step = tuning_max / max_space_dimension_size;
+
+  // Not declared as static constexpr to work around the following compiler bug
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96862
+  // where a floating-point expression cannot be constexpr under -frounding-math
+  double tuning_step = tuning_max / max_space_dimension_size;
 
   using StoredProblemSpace =
       typename Impl::MapTypeConverter<ProblemSpaceInput>::type;
@@ -315,17 +319,45 @@ class MultidimensionalSparseTuningProblem {
 
   using ValueArray = std::array<Kokkos::Tools::Experimental::VariableValue,
                                 space_dimensionality>;
+  template <class Key, class Value>
+  using extended_map = std::map<Key, Value>;
+  template <typename Key>
+  using extended_problem =
+      MultidimensionalSparseTuningProblem<extended_map, MaxDimensionSize, Key,
+                                          ProblemSpaceInput>;
+  template <typename Key, typename Value>
+  using ExtendedProblemSpace =
+      typename Impl::MapTypeConverter<extended_map<Key, Value>>::type;
+
+  template <typename Key>
+  auto extend(const std::string& axis_name,
+              const std::vector<Key>& new_tuning_axis) const
+      -> extended_problem<Key> {
+    ExtendedProblemSpace<Key, ProblemSpaceInput> extended_space;
+    for (auto& key : new_tuning_axis) {
+      extended_space.add_root_value(key);
+      extended_space.add_sub_container(m_space);
+    }
+    std::vector<std::string> extended_names;
+    extended_names.reserve(m_variable_names.size() + 1);
+    extended_names.push_back(axis_name);
+    extended_names.insert(extended_names.end(), m_variable_names.begin(),
+                          m_variable_names.end());
+    return extended_problem<Key>(extended_space, extended_names);
+  }
 
  private:
   StoredProblemSpace m_space;
   std::array<size_t, space_dimensionality> variable_ids;
+  std::vector<std::string> m_variable_names;
   size_t context;
 
  public:
   MultidimensionalSparseTuningProblem() = default;
-  MultidimensionalSparseTuningProblem(ProblemSpaceInput space,
+
+  MultidimensionalSparseTuningProblem(StoredProblemSpace space,
                                       const std::vector<std::string>& names)
-      : m_space(HierarchyConstructor::build(space)) {
+      : m_space(std::move(space)), m_variable_names(names) {
     assert(names.size() == space_dimensionality);
     for (unsigned long x = 0; x < names.size(); ++x) {
       VariableInfo info;
@@ -340,6 +372,20 @@ class MultidimensionalSparseTuningProblem {
     }
   }
 
+  MultidimensionalSparseTuningProblem(ProblemSpaceInput space,
+                                      const std::vector<std::string>& names)
+      : MultidimensionalSparseTuningProblem(HierarchyConstructor::build(space),
+                                            names) {}
+
+  template <typename... Coordinates>
+  auto get_point(Coordinates... coordinates) {
+    using ArrayType = std::array<Kokkos::Tools::Experimental::VariableValue,
+                                 sizeof...(coordinates)>;
+    return Impl::get_point(
+        m_space, ArrayType({Kokkos::Tools::Experimental::make_variable_value(
+                     0, static_cast<double>(coordinates))...}));
+  }
+
   auto begin() {
     context = Kokkos::Tools::Experimental::get_new_context_id();
     ValueArray values;
@@ -349,12 +395,28 @@ class MultidimensionalSparseTuningProblem {
     }
     begin_context(context);
     request_output_values(context, space_dimensionality, values.data());
-    return get_point(m_space, values);
+    return Impl::get_point(m_space, values);
   }
 
   auto end() { end_context(context); }
 };
 
+template <typename Tuner>
+struct ExtendableTunerMixin {
+  template <typename Key>
+  auto combine(const std::string& axis_name,
+               const std::vector<Key>& new_axis) const {
+    const auto& sub_tuner = static_cast<const Tuner*>(this)->get_tuner();
+    return sub_tuner.extend(axis_name, new_axis);
+  }
+
+  template <typename... Coordinates>
+  auto get_point(Coordinates... coordinates) {
+    const auto& sub_tuner = static_cast<const Tuner*>(this)->get_tuner();
+    return sub_tuner.get_point(coordinates...);
+  }
+};
+
 template <size_t MaxDimensionSize = 100, template <class...> class Container,
           class... TemplateArguments>
 auto make_multidimensional_sparse_tuning_problem(
@@ -362,7 +424,8 @@ auto make_multidimensional_sparse_tuning_problem(
   return MultidimensionalSparseTuningProblem<Container, MaxDimensionSize,
                                              TemplateArguments...>(in, names);
 }
-class TeamSizeTuner {
+
+class TeamSizeTuner : public ExtendableTunerMixin<TeamSizeTuner> {
  private:
   using SpaceDescription = std::map<int64_t, std::vector<int64_t>>;
   using TunerType = decltype(make_multidimensional_sparse_tuning_problem<20>(
@@ -481,7 +544,7 @@ class TeamSizeTuner {
     }
   }
 
- private:
+  TunerType get_tuner() const { return tuner; }
 };
 
 namespace Impl {
@@ -501,7 +564,7 @@ void fill_tile(std::map<T, Mapped>& cont, int tile_size) {
 }  // namespace Impl
 
 template <int MDRangeRank>
-struct MDRangeTuner {
+struct MDRangeTuner : public ExtendableTunerMixin<MDRangeTuner<MDRangeRank>> {
  private:
   static constexpr int rank       = MDRangeRank;
   static constexpr int max_slices = 15;
@@ -548,8 +611,45 @@ struct MDRangeTuner {
       tuner.end();
     }
   }
+
+  TunerType get_tuner() const { return tuner; }
 };
 
+template <class Choice>
+struct CategoricalTuner {
+  using choice_list = std::vector<Choice>;
+  choice_list choices;
+  size_t context;
+  size_t tuning_variable_id;
+  CategoricalTuner(std::string name, choice_list m_choices)
+      : choices(m_choices) {
+    std::vector<int64_t> indices;
+    for (typename decltype(choices)::size_type x = 0; x < choices.size(); ++x) {
+      indices.push_back(x);
+    }
+    VariableInfo info;
+    info.category      = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity = CandidateValueType::kokkos_value_set;
+    info.type          = ValueType::kokkos_value_int64;
+    info.candidates    = make_candidate_set(indices.size(), indices.data());
+    tuning_variable_id = declare_output_type(name, info);
+  }
+  const Choice& begin() {
+    context = get_new_context_id();
+    begin_context(context);
+    VariableValue value = make_variable_value(tuning_variable_id, int64_t(0));
+    request_output_values(context, 1, &value);
+    return choices[value.value.int_value];
+  }
+  void end() { end_context(context); }
+};
+
+template <typename Choice>
+auto make_categorical_tuner(std::string name, std::vector<Choice> choices)
+    -> CategoricalTuner<Choice> {
+  return CategoricalTuner<Choice>(name, choices);
+}
+
 }  // namespace Experimental
 }  // namespace Tools
 }  // namespace Kokkos
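A sketch of how the new CategoricalTuner might be driven; run_variant is a hypothetical user function and the choice strings are illustrative. If no tuning tool is attached, the requested value keeps its initial index 0, so the first choice is returned:

#include <Kokkos_Tuners.hpp>
#include <string>
#include <vector>

void run_variant(const std::string& variant);  // hypothetical dispatch

void tune_solver() {
  auto variant_tuner = Kokkos::Tools::Experimental::make_categorical_tuner(
      "solver_variant",
      std::vector<std::string>{"jacobi", "gauss_seidel", "cg"});

  for (int step = 0; step < 100; ++step) {
    // begin() opens a tuning context and asks the tool for an index into the
    // choice list; end() closes the context so the tool can record feedback.
    const std::string& variant = variant_tuner.begin();
    run_variant(variant);
    variant_tuner.end();
  }
}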
diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp
index 1abe0a48df5eab32f01ef703e6d39921eb9c70c3..b217cc4bc171a94ce3fdeea9ce6301c70c106da9 100644
--- a/packages/kokkos/core/src/Kokkos_View.hpp
+++ b/packages/kokkos/core/src/Kokkos_View.hpp
@@ -190,9 +190,9 @@ struct ViewTraits<void, void, Prop...> {
 };
 
 template <class ArrayLayout, class... Prop>
-struct ViewTraits<typename std::enable_if<
-                      Kokkos::Impl::is_array_layout<ArrayLayout>::value>::type,
-                  ArrayLayout, Prop...> {
+struct ViewTraits<
+    typename std::enable_if<Kokkos::is_array_layout<ArrayLayout>::value>::type,
+    ArrayLayout, Prop...> {
   // Specify layout, keep subsequent space and memory traits arguments
 
   using execution_space = typename ViewTraits<void, Prop...>::execution_space;
@@ -204,9 +204,8 @@ struct ViewTraits<typename std::enable_if<
 };
 
 template <class Space, class... Prop>
-struct ViewTraits<
-    typename std::enable_if<Kokkos::Impl::is_space<Space>::value>::type, Space,
-    Prop...> {
+struct ViewTraits<typename std::enable_if<Kokkos::is_space<Space>::value>::type,
+                  Space, Prop...> {
   // Specify Space, memory traits should be the only subsequent argument.
 
   static_assert(
@@ -230,8 +229,8 @@ struct ViewTraits<
 };
 
 template <class MemoryTraits, class... Prop>
-struct ViewTraits<typename std::enable_if<Kokkos::Impl::is_memory_traits<
-                      MemoryTraits>::value>::type,
+struct ViewTraits<typename std::enable_if<
+                      Kokkos::is_memory_traits<MemoryTraits>::value>::type,
                   MemoryTraits, Prop...> {
   // Specify memory trait, should not be any subsequent arguments
 
@@ -1543,7 +1542,8 @@ class View : public ViewTraits<DataType, Properties...> {
     // to avoid incomplete type errors from using Kokkos::Cuda directly.
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::View<...>::View: fence before allocating UVM");
     }
 #endif
     //------------------------------------------------------------
@@ -1555,7 +1555,8 @@ class View : public ViewTraits<DataType, Properties...> {
 #if defined(KOKKOS_ENABLE_CUDA)
     if (std::is_same<Kokkos::CudaUVMSpace,
                      typename traits::device_type::memory_space>::value) {
-      typename traits::device_type::memory_space::execution_space().fence();
+      typename traits::device_type::memory_space::execution_space().fence(
+          "Kokkos::View<...>::View: fence after allocating UVM");
     }
 #endif
     //------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
index bdc8993c398f2dd6d6b581008d1f0c8d3535d860..dbb557c13743fa79235ba3786a367b1ab2ac7adc 100644
--- a/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
+++ b/packages/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp
@@ -213,7 +213,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_queue.size()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "graph init");
     }
 
     {  // execute-after counts
@@ -221,7 +223,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_graph.entries.size()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "graph count");
     }
 
     {  // Scheduling ready tasks
@@ -229,7 +233,9 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits<Properties...> {
       using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
       const closure_type closure(*this, policy_type(0, m_graph.numRows()));
       closure.execute();
-      execution_space().fence();
+      execution_space().fence(
+          "Kokkos::WorkGraphPolicy::WorkGraphPolicy: fence after executing "
+          "readied graph");
     }
   }
 };
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
index e530612a57f81dace23777bdf98670dd73a9d026..0d521479eef89121d30c33a41cebdfac4628646f 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
@@ -447,7 +447,13 @@ OpenMP OpenMP::create_instance(...) { return OpenMP(); }
 
 int OpenMP::concurrency() { return Impl::g_openmp_hardware_max_threads; }
 
-void OpenMP::fence() const {}
+void OpenMP::fence() const {
+  fence("Kokkos::OpenMP::fence: Unnamed Instance Fence");
+}
+void OpenMP::fence(const std::string &name) const {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, []() {});
+}
 
 namespace Impl {
 
@@ -474,6 +480,9 @@ void OpenMPSpaceInitializer::finalize(const bool) {
 }
 
 void OpenMPSpaceInitializer::fence() { Kokkos::OpenMP::impl_static_fence(); }
+void OpenMPSpaceInitializer::fence(const std::string &name) {
+  Kokkos::OpenMP::impl_static_fence(OpenMP(), name);
+}
 
 void OpenMPSpaceInitializer::print_configuration(std::ostream &msg,
                                                  const bool detail) {
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
index 82f049ed136119c28b4add24f1460831fec55b16..1191e49cbe6ecd8d03069288062c88e38e6b8830 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -151,7 +151,14 @@ int OpenMP::impl_thread_pool_rank() noexcept {
 #endif
 }
 
-inline void OpenMP::impl_static_fence(OpenMP const& /*instance*/) noexcept {}
+inline void OpenMP::impl_static_fence(OpenMP const& /*instance*/,
+                                      const std::string& name) noexcept {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::OpenMP>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {});
+}
 
 inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept {
   return false;
@@ -213,8 +220,9 @@ void OpenMP::partition_master(F const& f, int num_partitions,
 
 namespace Experimental {
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 template <>
-class MasterLock<OpenMP> {
+class KOKKOS_DEPRECATED MasterLock<OpenMP> {
  public:
   void lock() { omp_set_lock(&m_lock); }
   void unlock() { omp_unset_lock(&m_lock); }
@@ -231,6 +239,7 @@ class MasterLock<OpenMP> {
  private:
   omp_lock_t m_lock;
 };
+#endif
 
 template <>
 class UniqueToken<OpenMP, UniqueTokenScope::Instance> {
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
index 2a4a7b1d53bd4785f26508fbc990148291bd9763..d9234e34191a39a84b34d9651975b09fa5801a35 100644
--- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -324,11 +324,6 @@ class TaskQueueSpecializationConstrained<
                 // count of 0 also. Otherwise, returns a task from another queue
                 // or `end` if one couldn't be popped
                 task = team_queue.attempt_to_steal_task();
-#if 0
-                if(task != no_more_tasks_sentinel && task != end) {
-                  std::printf("task stolen on rank %d\n", team_exec.league_rank());
-                }
-#endif
               }
 
               // If still tasks are still executing
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
index f13875b440b63b729a64615a20da0f597a85cf6e..7ff885ed86b94e942cce18ff2e7ff1ed664129fa 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
@@ -77,9 +77,10 @@ namespace Kokkos {
 namespace Impl {
 
 void OpenMPTargetExec::verify_is_process(const char* const label) {
-  if (omp_in_parallel()) {
+  // Fails if the current task is in a parallel region or is not on the host.
+  if (omp_in_parallel() && (!omp_is_initial_device())) {
     std::string msg(label);
-    msg.append(" ERROR: in parallel");
+    msg.append(" ERROR: in parallel or on device");
     Kokkos::Impl::throw_runtime_exception(msg);
   }
 }
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
index 0b65e0d4a4b2270fdf577b4fffc1a10835467a47..ccfc756213695df6479634a67405d7d89308dbb8 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
@@ -54,7 +54,10 @@
 // FIXME_OPENMPTARGET - Using this macro to implement a workaround for
 // hierarchical reducers. It avoids hitting the code path which we wanted to
 // write but doesn't work. undef'ed at the end.
+// Intel compilers prefer the non-workaround version.
+#ifndef KOKKOS_ARCH_INTEL_GPU
 #define KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -66,10 +69,6 @@ template <class Reducer>
 struct OpenMPTargetReducerWrapper {
   using value_type = typename Reducer::value_type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type&, const value_type&) {
     printf(
@@ -90,7 +89,6 @@ struct OpenMPTargetReducerWrapper {
         "Using a generic unknown Reducer for the OpenMPTarget backend is not "
         "implemented.");
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -99,10 +97,6 @@ struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) { dest += src; }
@@ -116,7 +110,6 @@ struct OpenMPTargetReducerWrapper<Sum<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::sum();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -125,10 +118,6 @@ struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) { dest *= src; }
@@ -142,7 +131,6 @@ struct OpenMPTargetReducerWrapper<Prod<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::prod();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -151,10 +139,6 @@ struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -170,7 +154,6 @@ struct OpenMPTargetReducerWrapper<Min<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -179,10 +162,6 @@ struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -199,7 +178,6 @@ struct OpenMPTargetReducerWrapper<Max<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::max();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -208,10 +186,6 @@ struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
     dest = dest && src;
@@ -226,7 +200,6 @@ struct OpenMPTargetReducerWrapper<LAnd<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::land();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -237,10 +210,6 @@ struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
 
   using result_view_type = Kokkos::View<value_type, Space>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -256,7 +225,6 @@ struct OpenMPTargetReducerWrapper<LOr<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::lor();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -265,10 +233,6 @@ struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -284,7 +248,6 @@ struct OpenMPTargetReducerWrapper<BAnd<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::band();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -293,10 +256,6 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
   // Required
   using value_type = typename std::remove_cv<Scalar>::type;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -312,7 +271,6 @@ struct OpenMPTargetReducerWrapper<BOr<Scalar, Space>> {
   static void init(value_type& val) {
     val = reduction_identity<value_type>::bor();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -325,10 +283,6 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
   // Required
   using value_type = ValLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -345,7 +299,6 @@ struct OpenMPTargetReducerWrapper<MinLoc<Scalar, Index, Space>> {
     val.val = reduction_identity<scalar_type>::min();
     val.loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -358,10 +311,6 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
   // Required
   using value_type = ValLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
     if (src.val > dest.val) dest = src;
@@ -377,7 +326,6 @@ struct OpenMPTargetReducerWrapper<MaxLoc<Scalar, Index, Space>> {
     val.val = reduction_identity<scalar_type>::max();
     val.loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Space>
@@ -389,10 +337,6 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
   // Required
   using value_type = MinMaxScalar<scalar_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -419,7 +363,6 @@ struct OpenMPTargetReducerWrapper<MinMax<Scalar, Space>> {
     val.max_val = reduction_identity<scalar_type>::max();
     val.min_val = reduction_identity<scalar_type>::min();
   }
-#pragma omp end declare target
 };
 
 template <class Scalar, class Index, class Space>
@@ -432,10 +375,6 @@ struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
   // Required
   using value_type = MinMaxLocScalar<scalar_type, index_type>;
 
-// WORKAROUND OPENMPTARGET
-// This pragma omp declare target should not be necessary, but Intel compiler
-// fails without it
-#pragma omp declare target
   // Required
   KOKKOS_INLINE_FUNCTION
   static void join(value_type& dest, const value_type& src) {
@@ -468,7 +407,6 @@ struct OpenMPTargetReducerWrapper<MinMaxLoc<Scalar, Index, Space>> {
     val.max_loc = reduction_identity<index_type>::min();
     val.min_loc = reduction_identity<index_type>::min();
   }
-#pragma omp end declare target
 };
 /*
 template<class ReducerType>
@@ -560,47 +498,20 @@ class OpenMPTargetExecTeamMember {
   void* m_glb_scratch;
   void* m_reduce_scratch;
 
-  /*
-  // Fan-in team threads, root of the fan-in which does not block returns true
-  inline
-  bool team_fan_in() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
-  m_team_rank_rev & n ) ; n <<= 1 ) {
-
-        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
-      }
-
-      if ( m_team_rank_rev ) {
-        m_exec.state_set( Rendezvous );
-        memory_fence();
-        m_exec.state_wait( Rendezvous );
-      }
-
-      return 0 == m_team_rank_rev ;
-    }
-
-  inline
-  void team_fan_out() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! (
-  m_team_rank_rev & n ) ; n <<= 1 ) { m_exec.pool_rev( m_team_base_rev + j
-  )->state_set( Active ); memory_fence();
-      }
-    }
-  */
  public:
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_shmem() const {
     return m_team_shared.set_team_thread_mode(0, 1, 0);
   }
 
+  // set_team_thread_mode parameters:
+  //   first  - scratch level.
+  //   second - size multiplier for advancing the scratch pointer after a
+  //            request has been serviced.
+  //   third  - offset size multiplier from the current scratch pointer when
+  //            returning a pointer for a request.
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_scratch(int level) const {
-    return m_team_shared.set_team_thread_mode(level, 1,
-                                              m_team_scratch_size[level]);
+    return m_team_shared.set_team_thread_mode(level, 1, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -627,8 +538,9 @@ class OpenMPTargetExecTeamMember {
     using type =
         typename std::conditional<(sizeof(ValueType) < TEAM_REDUCE_SIZE),
                                   ValueType, void>::type;
-    type* team_scratch = reinterpret_cast<type*>(
-        ((char*)(m_glb_scratch) + TEAM_REDUCE_SIZE * omp_get_team_num()));
+    type* team_scratch =
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
 #pragma omp barrier
     if (team_rank() == thread_id) *team_scratch = value;
 #pragma omp barrier
@@ -656,7 +568,8 @@ class OpenMPTargetExecTeamMember {
 
     const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type);
     type* team_scratch =
-        (type*)((char*)m_glb_scratch + TEAM_REDUCE_SIZE * omp_get_team_num());
+        reinterpret_cast<type*>(static_cast<char*>(m_glb_scratch) +
+                                TEAM_REDUCE_SIZE * omp_get_team_num());
     for (int i = m_team_rank; i < n_values; i += m_team_size) {
       team_scratch[i] = value_type();
     }
@@ -770,27 +683,24 @@ class OpenMPTargetExecTeamMember {
         m_shmem_block_index(shmem_block_index),
         m_glb_scratch(glb_scratch) {
     const int omp_tid = omp_get_thread_num();
-    m_team_shared     = scratch_memory_space(
-        ((char*)glb_scratch +
-         m_shmem_block_index *
-             (shmem_size_L0 + shmem_size_L1 +
-              ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE)),
-        shmem_size_L0,
-        ((char*)glb_scratch +
-         m_shmem_block_index * (shmem_size_L0 + shmem_size_L1 +
-                                ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
-                                TEAM_REDUCE_SIZE)) +
-            shmem_size_L0 + ((shmem_size_L0 + shmem_size_L1) * 10 / 100) +
-            TEAM_REDUCE_SIZE,
-        shmem_size_L1);
-    m_reduce_scratch =
-        (char*)glb_scratch +
-        shmem_block_index *
-            (shmem_size_L0 + shmem_size_L1 +
-             ((shmem_size_L0 + shmem_size_L1) * 10 / 100) + TEAM_REDUCE_SIZE);
-    m_league_rank = league_rank;
-    m_team_rank   = omp_tid;
-    m_vector_lane = 0;
+
+    // The scratch memory allocated is the sum of TEAM_REDUCE_SIZE, the L0 and
+    // the L1 shmem sizes. TEAM_REDUCE_SIZE = 512 bytes is reserved per team
+    // for the hierarchical reduction. An additional 10% of the requested
+    // scratch memory is allocated per team as padding, hence the factor 0.1.
+    const int reduce_offset =
+        m_shmem_block_index *
+        (shmem_size_L0 + shmem_size_L1 +
+         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
+    const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
+    const int l1_offset = l0_offset + shmem_size_L0;
+    m_team_shared       = scratch_memory_space(
+        (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
+        static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
+    m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
+    m_league_rank    = league_rank;
+    m_team_rank      = omp_tid;
+    m_vector_lane    = 0;
   }
 
   static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; }
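As a worked example of the offset arithmetic above (illustrative numbers only, assuming shmem_size_L0 = 1024, shmem_size_L1 = 2048, TEAM_REDUCE_SIZE = 512 and m_shmem_block_index = 3):

// Per-team block: L0 + L1 + ~10% padding + TEAM_REDUCE_SIZE bytes.
const int block_bytes   = 1024 + 2048 + (1024 + 2048) / 10 + 512;  // 3891
const int reduce_offset = 3 * block_bytes;                         // 11673
const int l0_offset     = reduce_offset + 512;  // 12185, past the reduce area
const int l1_offset     = l0_offset + 1024;     // 13209, past the L0 area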
@@ -877,6 +787,16 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
   friend class TeamPolicyInternal;
 
  public:
+  // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda
+  // implementation, but this has to be tailored to be architecture specific.
+  inline static int scratch_size_max(int level) {
+    return (
+        level == 0 ? 1024 * 40 :  // 48kB is the max for CUDA, but we need some
+                                  // for team_member.reduce etc.
+            20 * 1024 *
+                1024);  // arbitrarily setting this to 20MB, for a Volta V100
+                        // that would give us about 3.2GB for 2 teams per SM
+  }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline void impl_set_team_size(const size_t size) { m_team_size = size; }
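A hedged usage sketch of where these limits matter: per-team scratch is requested on the policy and must stay within the level-0/level-1 maxima reported by scratch_size_max. `league_size` and `team_functor` are placeholders.

using policy_t = Kokkos::TeamPolicy<Kokkos::Experimental::OpenMPTarget>;
// 8 kB of level-0 and 1 MB of level-1 scratch per team, both below the caps
// returned by scratch_size_max(0) and scratch_size_max(1) above.
Kokkos::parallel_for(
    "team_kernel",
    policy_t(league_size, Kokkos::AUTO)
        .set_scratch_size(0, Kokkos::PerTeam(8 * 1024))
        .set_scratch_size(1, Kokkos::PerTeam(1024 * 1024)),
    team_functor);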
@@ -884,9 +804,11 @@ class TeamPolicyInternal<Kokkos::Experimental::OpenMPTarget, Properties...>
     m_tune_vector_length = length;
   }
   inline int impl_vector_length() const { return m_vector_length; }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED inline int vector_length() const {
     return impl_vector_length();
   }
+#endif
   inline int team_size() const { return m_team_size; }
   inline int league_size() const { return m_league_size; }
   inline size_t scratch_size(const int& level, int team_size_ = -1) const {
@@ -1245,21 +1167,12 @@ KOKKOS_INLINE_FUNCTION
       static_cast<ValueType*>(loop_boundaries.team.impl_reduce_scratch());
 
 #pragma omp barrier
-  // These three lines all cause crash
   Impl::OpenMPTargetReducerWrapper<ReducerType>::init(TeamThread_scratch[0]);
-//  result.init(TeamThread_scratch[0]);
-//  Impl::OpenMPTargetReducerWrapper<ReducerType> red;
-//  red.init(TeamThread_scratch[0]);
 #pragma omp barrier
 
 #pragma omp for reduction(custominner : TeamThread_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp;
-    result.init(tmp);
-    lambda(i, tmp);
-    // This line causes a crash
-    Impl::OpenMPTargetReducerWrapper<ReducerType>::join(TeamThread_scratch[0],
-                                                        tmp);
+    lambda(i, TeamThread_scratch[0]);
   }
   result.reference() = TeamThread_scratch[0];
 }
@@ -1305,6 +1218,12 @@ KOKKOS_INLINE_FUNCTION
          i += team_size) {
       lambda(i, tmp2);
     }
+
+    // FIXME_OPENMPTARGET: Join should work but doesn't. Every thread gets a
+    // private TeamThread_scratch[0] and at the end of the for-loop the `join`
+    // operation is performed by OpenMP itself and hence the simple assignment
+    // works.
+    //    result.join(TeamThread_scratch[0], tmp2);
     TeamThread_scratch[0] = tmp2;
   }
 
@@ -1336,28 +1255,31 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce(
   static_assert(sizeof(ValueType) <=
                 Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE);
 
+  // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here.
+  const int value_count = 1;
+
 #pragma omp barrier
   TeamThread_scratch[0] = init_result;
 #pragma omp barrier
 
-  if constexpr (std::is_arithmetic<ValueType>::value) {
-#pragma omp for reduction(+ : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      TeamThread_scratch[0] += tmp;
-    }
-  } else {
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
-
-#pragma omp for reduction(custom : TeamThread_scratch[:1])
-    for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-      ValueType tmp = ValueType();
-      lambda(i, tmp);
-      join(TeamThread_scratch[0], tmp);
+#pragma omp for
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
+    lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]);
+  }
+
+  // Reduce all partial results within a team.
+  const int team_size      = omp_get_num_threads();
+  int tree_neighbor_offset = 1;
+  do {
+#pragma omp for
+    for (int i = 0; i < team_size - tree_neighbor_offset;
+         i += 2 * tree_neighbor_offset) {
+      const int neighbor = i + tree_neighbor_offset;
+      join(lambda, &TeamThread_scratch[i * value_count],
+           &TeamThread_scratch[neighbor * value_count]);
     }
-  }
-
+    tree_neighbor_offset *= 2;
+  } while (tree_neighbor_offset < team_size);
   init_result = TeamThread_scratch[0];
 }
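The pairwise tree reduction above is easier to follow on a plain array; a standalone sketch with an ordinary `+` join (array size and values are made up):

double partial[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // per-thread partial results
const int team_size = 8;
int offset = 1;
do {
  // Each sweep joins element i with its neighbor at distance `offset`.
  for (int i = 0; i < team_size - offset; i += 2 * offset)
    partial[i] += partial[i + offset];
  offset *= 2;
} while (offset < team_size);
// After log2(team_size) sweeps, partial[0] holds the full reduction (36 here).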
 
@@ -1402,7 +1324,6 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 }
 
 }  // namespace Kokkos
-#undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 
 namespace Kokkos {
 /** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each
@@ -1530,8 +1451,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan(
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
 #endif
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
-       i += loop_boundaries.increment) {
+  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
     lambda(i, scan_val, true);
   }
 }
@@ -1629,9 +1549,7 @@ KOKKOS_INLINE_FUNCTION
 
 #pragma omp for simd reduction(custom : TeamVector_scratch[:1])
   for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) {
-    ValueType tmp = ValueType();
-    lambda(i, tmp);
-    TeamVector_scratch[0] += tmp;
+    lambda(i, TeamVector_scratch[0]);
   }
 
   result.reference() = TeamVector_scratch[0];
@@ -1686,7 +1604,9 @@ KOKKOS_INLINE_FUNCTION
 #endif  // KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 }  // namespace Kokkos
 
+#ifdef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
 #undef KOKKOS_IMPL_HIERARCHICAL_REDUCERS_WORKAROUND
+#endif
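A hedged usage sketch of the nested reductions implemented in this header: a TeamThreadRange parallel_reduce inside a league-level parallel_reduce. `league_size` and `row_length` are placeholders.

using team_policy = Kokkos::TeamPolicy<Kokkos::Experimental::OpenMPTarget>;
using member_type = team_policy::member_type;

double total = 0.0;
Kokkos::parallel_reduce(
    "nested_sum", team_policy(league_size, Kokkos::AUTO),
    KOKKOS_LAMBDA(const member_type& team, double& league_sum) {
      double team_sum = 0.0;
      // Team-level reduction dispatched to the TeamThreadRange overloads above.
      Kokkos::parallel_reduce(
          Kokkos::TeamThreadRange(team, row_length),
          [&](const int i, double& sum) { sum += 1.0 * i; }, team_sum);
      Kokkos::single(Kokkos::PerTeam(team), [&]() { league_sum += team_sum; });
    },
    total);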
 
 namespace Kokkos {
 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
index 4a79b72732dafb9bd93613723551ec7a9b01ddd1..e421edc5b4c108b03fb8680863184828df243dd7 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
@@ -59,7 +59,34 @@
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
-void OpenMPTargetInternal::fence() {}
+uint32_t OpenMPTargetInternal::impl_get_instance_id() const noexcept {
+  return m_instance_id;
+}
+
+void OpenMPTargetInternal::fence(openmp_fence_is_static is_static) {
+  fence(
+      "Kokkos::Experimental::Impl::OpenMPTargetInternal::fence: Unnamed "
+      "Internal Fence",
+      is_static);
+}
+void OpenMPTargetInternal::fence(const std::string& name,
+                                 openmp_fence_is_static is_static) {
+  if (is_static == openmp_fence_is_static::no) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<
+        Kokkos::Experimental::OpenMPTarget>(
+        name,
+        Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+            impl_get_instance_id()},
+        [&]() {});
+  } else {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<
+        Kokkos::Experimental::OpenMPTarget>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        [&]() {});
+  }
+}
 int OpenMPTargetInternal::concurrency() { return 128000; }
 const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; }
 void OpenMPTargetInternal::print_configuration(std::ostream& /*stream*/,
@@ -77,7 +104,18 @@ void OpenMPTargetInternal::impl_finalize() {
     Kokkos::kokkos_free<Kokkos::Experimental::OpenMPTargetSpace>(
         space.m_uniquetoken_ptr);
 }
-void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; }
+void OpenMPTargetInternal::impl_initialize() {
+  m_is_initialized = true;
+
+  // FIXME_OPENMPTARGET:  Only fix the number of teams for NVIDIA architectures
+  // from Pascal and upwards.
+#if defined(KOKKOS_ARCH_PASCAL) || defined(KOKKOS_ARCH_VOLTA) || \
+    defined(KOKKOS_ARCH_TURING75) || defined(KOKKOS_ARCH_AMPERE)
+#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300)
+  omp_set_num_teams(512);
+#endif
+#endif
+}
 int OpenMPTargetInternal::impl_is_initialized() {
   return m_is_initialized ? 1 : 0;
 }
@@ -100,11 +138,28 @@ void OpenMPTarget::print_configuration(std::ostream& stream,
   m_space_instance->print_configuration(stream, detail);
 }
 
+uint32_t OpenMPTarget::impl_instance_id() const noexcept {
+  return m_space_instance->impl_get_instance_id();
+}
+
 int OpenMPTarget::concurrency() {
   return Impl::OpenMPTargetInternal::impl_singleton()->concurrency();
 }
 void OpenMPTarget::fence() {
-  Impl::OpenMPTargetInternal::impl_singleton()->fence();
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence");
+}
+void OpenMPTarget::fence(const std::string& name) {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(name);
+}
+void OpenMPTarget::impl_static_fence() {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence",
+      Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
+}
+void OpenMPTarget::impl_static_fence(const std::string& name) {
+  Impl::OpenMPTargetInternal::impl_singleton()->fence(
+      name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes);
 }
 
 void OpenMPTarget::impl_initialize() { m_space_instance->impl_initialize(); }
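A hedged usage sketch of the fence entry points added here; the labels are arbitrary strings forwarded to the profiling fence events.

Kokkos::Experimental::OpenMPTarget exec;
exec.fence();                                 // unnamed instance fence
exec.fence("MyApp: after offloaded kernel");  // named instance fence
// Static (global) fence, reported against GlobalDeviceSynchronization.
Kokkos::Experimental::OpenMPTarget::impl_static_fence("MyApp: global sync");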
@@ -146,7 +201,10 @@ void OpenMPTargetSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void OpenMPTargetSpaceInitializer::fence() {
-  Kokkos::Experimental::OpenMPTarget::fence();
+  Kokkos::Experimental::OpenMPTarget::impl_static_fence();
+}
+void OpenMPTargetSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::OpenMPTarget::impl_static_fence(name);
 }
 
 void OpenMPTargetSpaceInitializer::print_configuration(std::ostream& msg,
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
index a1caf90c195b98511b4476db73345816af2b4669..b495771190e35c661df26a5ab3bc1d53a544cff7 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp
@@ -51,6 +51,8 @@ namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
+enum class openmp_fence_is_static { yes, no };
+
 class OpenMPTargetInternal {
  private:
   OpenMPTargetInternal()                            = default;
@@ -58,7 +60,9 @@ class OpenMPTargetInternal {
   OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default;
 
  public:
-  void fence();
+  void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no);
+  void fence(const std::string& name,
+             openmp_fence_is_static is_static = openmp_fence_is_static::no);
 
   /** \brief  Return the maximum amount of concurrency.  */
   int concurrency();
@@ -73,14 +77,16 @@ class OpenMPTargetInternal {
 
   //! Has been initialized
   int impl_is_initialized();
-
+  uint32_t impl_get_instance_id() const noexcept;
   //! Initialize, telling the CUDA run-time library which device to use.
   void impl_initialize();
 
   static OpenMPTargetInternal* impl_singleton();
 
  private:
-  bool m_is_initialized = false;
+  bool m_is_initialized  = false;
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::OpenMPTarget>(reinterpret_cast<uintptr_t>(this));
 };
 }  // Namespace Impl
 }  // Namespace Experimental
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index a4092c3a37a7e9a1493576c5efe783334982a391..08a3109408bd88bcfa04f3fe31d09f5a6e1cff2c 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -51,8 +51,6 @@
 #include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#define KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
-
 namespace Kokkos {
 namespace Impl {
 
@@ -69,24 +67,10 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
  public:
-  inline void execute() const { execute_impl<WorkTag>(); }
-  /*
-    template <class TagType>
-    inline typename std::enable_if<std::is_same<TagType, void>::value>::type
-    execute_impl() const {
-      OpenMPTargetExec::verify_is_process(
-          "Kokkos::Experimental::OpenMPTarget parallel_for");
-      OpenMPTargetExec::verify_initialized(
-          "Kokkos::Experimental::OpenMPTarget parallel_for");
-      const typename Policy::member_type begin = m_policy.begin();
-      const typename Policy::member_type end   = m_policy.end();
-
-  #pragma omp target teams distribute parallel for map(to: this->m_functor)
-      for (int i = begin; i < end; i++) m_functor(i);
-    }
-  */
+  void execute() const { execute_impl<WorkTag>(); }
+
   template <class TagType>
-  inline void execute_impl() const {
+  void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -98,16 +82,17 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 
     FunctorType a_functor(m_functor);
 
-    if constexpr (std::is_same<TagType, void>::value) {
 #pragma omp target teams distribute parallel for map(to : a_functor)
-      for (auto i = begin; i < end; i++) a_functor(i);
-    } else {
-#pragma omp target teams distribute parallel for map(to : a_functor)
-      for (auto i = begin; i < end; i++) a_functor(TagType(), i);
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_same<TagType, void>::value) {
+        a_functor(i);
+      } else {
+        a_functor(TagType(), i);
+      }
     }
   }
 
-  inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
+  ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
 };
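A hedged sketch of the two cases the `if constexpr` dispatch above handles: an untagged RangePolicy calls operator()(i), while a policy carrying a work tag calls operator()(Tag{}, i). The view `a` and extent `n` are placeholders.

struct UpdateTag {};

struct Fill {
  Kokkos::View<double*, Kokkos::Experimental::OpenMPTargetSpace> a;

  KOKKOS_FUNCTION void operator()(const int i) const { a(i) = i; }
  KOKKOS_FUNCTION void operator()(UpdateTag, const int i) const { a(i) += 1.0; }
};

Fill f{a};
// TagType is void: the first branch runs and a_functor(i) is called.
Kokkos::parallel_for(
    Kokkos::RangePolicy<Kokkos::Experimental::OpenMPTarget>(0, n), f);
// TagType is UpdateTag: the second branch runs and a_functor(TagType(), i) is
// called.
Kokkos::parallel_for(
    Kokkos::RangePolicy<Kokkos::Experimental::OpenMPTarget, UpdateTag>(0, n),
    f);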
 
@@ -120,12 +105,31 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
 namespace Kokkos {
 namespace Impl {
 
+// This class has the memcpy routine that is commonly used by ParallelReduce
+// over RangePolicy and TeamPolicy.
+template <class PointerType>
+struct ParallelReduceCommon {
+  // Copy the result back to device if the view is on the device.
+  static void memcpy_result(PointerType dest, PointerType src, size_t size,
+                            bool ptr_on_device) {
+    if (ptr_on_device) {
+      OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0,
+                                       omp_get_default_device(),
+                                       omp_get_initial_device()));
+    } else {
+      *dest = *src;
+    }
+  }
+};
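A hedged sketch of the two situations memcpy_result distinguishes: a parallel_reduce result landing in host memory (plain assignment) versus in a device-resident View (omp_target_memcpy). `n` and `functor` are placeholders.

// Host-resident result: ptr_on_device is false, so *dest = *src is used.
double host_sum = 0.0;
Kokkos::parallel_reduce("to_host", n, functor, host_sum);

// Device-resident result: ptr_on_device is true, so the value is copied with
// omp_target_memcpy from the initial (host) device to the default device.
Kokkos::View<double, Kokkos::Experimental::OpenMPTargetSpace> dev_sum("dev_sum");
Kokkos::parallel_reduce("to_device", n, functor, dev_sum);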
+
 template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType, bool FunctorHasJoin,
-          bool UseReducerType>
+          class PointerType, class ValueType>
 struct ParallelReduceSpecialize {
-  static inline void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
+  inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/,
                              PointerType /*result_ptr*/) {
+    constexpr int FunctorHasJoin = ReduceFunctorHasJoin<FunctorType>::value;
+    constexpr int UseReducerType = is_reducer_type<ReducerType>::value;
+
     std::stringstream error_message;
     error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' '
                   << UseReducerType << '\n';
@@ -137,12 +141,26 @@ struct ParallelReduceSpecialize {
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                false> {
+                                ReducerType, PointerType, ValueType> {
   using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
+                         void>;
+
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
+  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
+  using ReferenceType = typename ValueTraits::reference_type;
+
+  using ParReduceCommon = ParallelReduceCommon<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -153,69 +171,220 @@ struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>,
     if (end <= begin) return;
 
     ValueType result = ValueType();
-    if constexpr (std::is_same<TagType, void>::value) {
-#pragma omp target teams distribute parallel for num_teams(512) \
-                map(to:f) map(tofrom:result) reduction(+: result)
-      for (auto i = begin; i < end; i++) f(i, result);
-    } else {
-#pragma omp target teams distribute parallel for num_teams(512) \
-                map(to:f) map(tofrom:result) reduction(+: result)
-      for (auto i = begin; i < end; i++) f(TagType(), i, result);
-    }
-
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
-  }
-};
 
-template <class FunctorType, class PolicyType, class ReducerType,
-          class PointerType, class ValueType>
-struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType,
-                                PointerType, ValueType, false, true> {
 #pragma omp declare reduction(                                         \
     custom:ValueType                                                   \
     : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
     initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
 
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+    OpenMPTargetReducerWrapper<ReducerType>::init(result);
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+    for (auto i = begin; i < end; ++i) {
+      if constexpr (std::is_same<TagType, void>::value) {
+        f(i, result);
+      } else {
+        f(TagType(), i, result);
+      }
+    }
+
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+  }
+
+  template <class TagType, int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
+    const auto begin = p.begin();
+    const auto end   = p.end();
 
     if (end <= begin) return;
 
     ValueType result = ValueType();
-    OpenMPTargetReducerWrapper<ReducerType>::init(result);
 
-    if constexpr (std::is_same<TagType, void>::value) {
-#pragma omp target teams distribute parallel for num_teams(512) map(to   \
-                                                                    : f) \
-    reduction(custom                                                     \
-              : result)
-      for (auto i = begin; i < end; i++) f(i, result);
-      *result_ptr = result;
+    // Enter this branch if the reduction is over a single scalar value.
+    if constexpr (NumReductions == 1) {
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams distribute parallel for \
+         map(to:f) reduction(+: result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      } else {
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
+#pragma omp target teams distribute parallel for map(to                    \
+                                                     : f) reduction(custom \
+                                                                    : result)
+        for (auto i = begin; i < end; ++i)
+
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+      }
     } else {
-#pragma omp target teams distribute parallel for num_teams(512) map(to   \
-                                                                    : f) \
-    reduction(custom                                                     \
-              : result)
-      for (auto i = begin; i < end; i++) f(TagType(), i, result);
-      *result_ptr = result;
+#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions])
+      for (auto i = begin; i < end; ++i) {
+        if constexpr (std::is_same<TagType, void>::value) {
+          f(i, result);
+        } else {
+          f(TagType(), i, result);
+        }
+      }
     }
+
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
   }
 
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    const auto begin = p.begin();
+    const auto end   = p.end();
+
+    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+
+    // Initialize the result pointer.
+
+    const auto size = end - begin;
+
+    // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently
+    // based on NVIDIA-V100 and should be modified to be based on the
+    // architecture in the future.
+    const int max_team_threads = 32;
+    const int max_teams =
+        OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads;
+    // Number of elements in the reduction
+    const auto value_count =
+        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+
+    // Allocate scratch per active thread. Achieved by setting the first
+    // parameter of `resize_scratch` to 1.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    ValueType* scratch_ptr =
+        static_cast<ValueType*>(OpenMPTargetExec::get_scratch_ptr());
+
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+    {
+      // Enter this branch if the functor has an `init`
+      if constexpr (HasInit) {
+        // The `init` routine needs to be called on the device since it might
+        // need device members.
+        ValueInit::init(f, scratch_ptr);
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      } else {
+        for (int i = 0; i < value_count; ++i) {
+          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
+        }
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    }
+
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
+      if (!ptr_on_device)
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \
+    map(to                                                                   \
+        : f) is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num    = omp_get_team_num();
+        const int num_teams   = omp_get_num_teams();
+        const auto chunk_size = size / num_teams;
+        const auto team_begin = begin + team_num * chunk_size;
+        const auto team_end =
+            (team_num == num_teams - 1) ? end : (team_begin + chunk_size);
+        ValueType* team_scratch =
+            scratch_ptr + team_num * max_team_threads * value_count;
+        ReferenceType result = ValueInit::init(
+            f, &team_scratch[omp_get_thread_num() * value_count]);
+
+        // Accumulate partial results in thread specific storage.
+#pragma omp for simd
+        for (auto i = team_begin; i < team_end; ++i) {
+          if constexpr (std::is_same<TagType, void>::value) {
+            f(i, result);
+          } else {
+            f(TagType(), i, result);
+          }
+        }
+
+        // Reduce all partial results within a team.
+        const int team_size      = max_team_threads;
+        int tree_neighbor_offset = 1;
+        do {
+#pragma omp for simd
+          for (int i = 0; i < team_size - tree_neighbor_offset;
+               i += 2 * tree_neighbor_offset) {
+            const int neighbor = i + tree_neighbor_offset;
+            ValueJoin::join(f, &team_scratch[i * value_count],
+                            &team_scratch[neighbor * value_count]);
+          }
+          tree_neighbor_offset *= 2;
+        } while (tree_neighbor_offset < team_size);
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < max_teams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = scratch_ptr;
+        const int team_offset   = max_team_threads * value_count;
+        ValueJoin::join(
+            f, &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
+          // Do the final only once at the end.
+          if (tree_neighbor_offset * 2 >= max_teams &&
+              omp_get_team_num() == 0 && omp_get_thread_num() == 0)
+            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < max_teams);
+
+    // If the result view is on the host, copy back the values via memcpy.
+    if (!ptr_on_device)
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_initial_device(), omp_get_default_device()));
+    else
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_default_device(), omp_get_default_device()));
   }
 };
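For reference, a hedged sketch of the functor shape that execute_init_join targets: a functor supplying its own init/join (and optionally final), following the Kokkos 3.x custom-reduction convention; exact signatures may differ between versions. The view `data` is a placeholder.

struct MinFunctor {
  using value_type = double;
  Kokkos::View<const double*, Kokkos::Experimental::OpenMPTargetSpace> data;

  KOKKOS_FUNCTION void operator()(const int i, value_type& update) const {
    if (data(i) < update) update = data(i);
  }
  // Detected by ReduceFunctorHasInit and called on the device.
  KOKKOS_FUNCTION void init(value_type& update) const {
    update = Kokkos::reduction_identity<double>::min();
  }
  // Detected by ReduceFunctorHasJoin; used for the in-team and cross-team
  // tree reductions above.
  KOKKOS_FUNCTION void join(volatile value_type& update,
                            const volatile value_type& input) const {
    if (input < update) update = input;
  }
  // An optional final(value_type&) would be detected by ReduceFunctorHasFinal
  // and invoked once after the last join.
};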
 
@@ -227,47 +396,77 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
 
   using WorkTag   = typename Policy::work_tag;
   using WorkRange = typename Policy::WorkRange;
-  using Member    = typename Policy::member_type;
 
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
   using WorkTagFwd =
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
 
-  // Static Assert WorkTag void if ReducerType not InvalidType
-
   using ValueTraits =
       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
 
   using pointer_type   = typename ValueTraits::pointer_type;
   using reference_type = typename ValueTraits::reference_type;
 
+  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
+  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
+
   using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type, HasJoin,
-                               UseReducer>;
+                               typename ValueTraits::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+  using TagType = typename Policy::work_tag;
 
  public:
-  inline void execute() const {
-    ParReduceSpecialize::execute(m_functor, m_policy, m_result_ptr);
+  void execute() const {
+    if constexpr (HasJoin) {
+      // Enter this branch if the functor provides its own init/join.
+      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      // Enter this branch if a reducer type is used.
+      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
+      // Enter this branch if the reduction is over an array; the routine is
+      // templated on the length of the array.
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<TagType, 2>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<TagType, 4>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<TagType, 8>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<TagType, 16>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<TagType, 32>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      // This branch handles the basic scalar reduction.
+      ParReduceSpecialize::template execute_array<TagType, 1>(
+          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
   }
 
   template <class ViewType>
-  inline ParallelReduce(
-      const FunctorType& arg_functor, Policy arg_policy,
+  ParallelReduce(
+      const FunctorType& arg_functor, Policy& arg_policy,
       const ViewType& arg_result_view,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
@@ -275,14 +474,23 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {}
-
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
+        m_result_ptr(arg_result_view.data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result_view.size()) {}
+
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ReducerType& reducer)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_num_elems(reducer.view().size()) {}
 };
 
 }  // namespace Impl
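A hedged sketch of an array-valued reduction that takes the execute_array path: the functor declares an array value_type and a value_count member, and the result length selects the power-of-two instantiation (3 elements fall into the <= 4 case). The view `m` and extent `nrows` are placeholders.

struct ColumnSums {
  using value_type = double[];  // marks this as an array reduction
  int value_count;              // run-time number of reduction elements
  Kokkos::View<const double**, Kokkos::Experimental::OpenMPTargetSpace> m;

  KOKKOS_FUNCTION void operator()(const int i, double dst[]) const {
    for (int j = 0; j < value_count; ++j) dst[j] += m(i, j);
  }
};

double sums[3] = {0.0, 0.0, 0.0};
Kokkos::parallel_reduce("column_sums", nrows, ColumnSums{3, m}, sums);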
@@ -318,20 +526,20 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
   const Policy m_policy;
 
   template <class TagType>
-  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
+  typename std::enable_if<std::is_same<TagType, void>::value>::type
   call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                 const bool& is_final) const {
     f(idx, val, is_final);
   }
   template <class TagType>
-  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
+  typename std::enable_if<!std::is_same<TagType, void>::value>::type
   call_with_tag(const FunctorType& f, const idx_type& idx, value_type& val,
                 const bool& is_final) const {
     f(WorkTag(), idx, val, is_final);
   }
 
  public:
-  inline void impl_execute(
+  void impl_execute(
       Kokkos::View<value_type**, Kokkos::LayoutRight,
                    Kokkos::Experimental::OpenMPTargetSpace>
           element_values,
@@ -349,13 +557,13 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp target teams distribute map(to                             \
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
 
 #pragma omp for
-        for (idx_type i = 0; i < chunk_size; i++) {
+        for (idx_type i = 0; i < chunk_size; ++i) {
           const idx_type idx = local_offset + i;
           value_type val;
           ValueInit::init(a_functor, &val);
@@ -366,7 +574,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
         if (omp_get_thread_num() == 0) {
           value_type sum;
           ValueInit::init(a_functor, &sum);
-          for (idx_type i = 0; i < chunk_size; i++) {
+          for (idx_type i = 0; i < chunk_size; ++i) {
             ValueJoin::join(a_functor, &sum, &element_values(team_id, i));
             element_values(team_id, i) = sum;
           }
@@ -377,7 +585,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
           if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) {
             value_type sum;
             ValueInit::init(a_functor, &sum);
-            for (idx_type i = 0; i < n_chunks; i++) {
+            for (idx_type i = 0; i < n_chunks; ++i) {
               ValueJoin::join(a_functor, &sum, &chunk_values(i));
               chunk_values(i) = sum;
             }
@@ -389,7 +597,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 #pragma omp target teams distribute map(to                             \
                                         : a_functor) num_teams(nteams) \
     thread_limit(team_size)
-    for (idx_type team_id = 0; team_id < n_chunks; team_id++) {
+    for (idx_type team_id = 0; team_id < n_chunks; ++team_id) {
 #pragma omp parallel num_threads(team_size)
       {
         const idx_type local_offset = team_id * chunk_size;
@@ -400,7 +608,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
           ValueInit::init(a_functor, &offset_value);
 
 #pragma omp for
-        for (idx_type i = 0; i < chunk_size; i++) {
+        for (idx_type i = 0; i < chunk_size; ++i) {
           const idx_type idx = local_offset + i;
           value_type local_offset_value;
           if (i > 0) {
@@ -415,7 +623,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
     }
   }
 
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -438,7 +646,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
 
   //----------------------------------------
 
-  inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
+  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
 
   //----------------------------------------
@@ -455,7 +663,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
   value_type& m_returnvalue;
 
  public:
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -513,7 +721,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const int m_shmem_size;
 
  public:
-  inline void execute() const {
+  void execute() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -523,7 +731,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
  private:
   template <class TagType>
-  inline void execute_impl() const {
+  void execute_impl() const {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -549,7 +757,6 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
-#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
 // Performing our own scheduling of teams to avoid separation of code between
 // teams-distribute and parallel. Gave a 2x performance boost in test cases with
 // the clang compiler. atomic_compare_exchange can be avoided since the standard
@@ -580,49 +787,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
       } else
         Kokkos::abort("`num_teams` clause was not respected.\n");
     }
-
-#else
-// Saving the older implementation that uses `atomic_compare_exchange` to
-// calculate the shared memory block index and `distribute` clause to distribute
-// teams.
-#pragma omp target teams distribute map(to                   \
-                                        : a_functor)         \
-    is_device_ptr(scratch_ptr, lock_array) num_teams(nteams) \
-        thread_limit(team_size)
-    for (int i = 0; i < league_size; i++) {
-      int shmem_block_index = -1, lock_team = 99999, iter = -1;
-      iter = (omp_get_team_num() % max_active_teams);
-
-      // Loop as long as a shmem_block_index is not found.
-      while (shmem_block_index == -1) {
-        // Try and acquire a lock on the index.
-        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
-
-        // If lock is acquired assign it to the block index.
-        // lock_team = 0, implies atomic_compare_exchange is successfull.
-        if (lock_team == 0)
-          shmem_block_index = iter;
-        else
-          iter = ++iter % max_active_teams;
-      }
-
-#pragma omp parallel num_threads(team_size)
-      {
-        typename Policy::member_type team(
-            i, league_size, team_size, vector_length, scratch_ptr,
-            shmem_block_index, shmem_size_L0, shmem_size_L1);
-        m_functor(team);
-      }
-
-      // Free the locked block and increment the number of available free
-      // blocks.
-      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
-    }
-#endif
   }
 
  public:
-  inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
+  ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy)
       : m_functor(arg_functor),
         m_policy(arg_policy),
         m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
@@ -633,13 +801,26 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 template <class FunctorType, class ReducerType, class PointerType,
           class ValueType, class... PolicyArgs>
 struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                false> {
+                                ReducerType, PointerType, ValueType> {
   using PolicyType = TeamPolicyInternal<PolicyArgs...>;
+  using TagType    = typename PolicyType::work_tag;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
+  using WorkTagFwd =
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, TagType,
+                         void>;
 
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit     = Kokkos::Impl::FunctorValueInit<FunctorType, TagType>;
+  using ValueJoin     = Kokkos::Impl::FunctorValueJoin<FunctorType, TagType>;
+  using ReferenceType = typename ValueTraits::reference_type;
+
+  using ParReduceCommon = ParallelReduceCommon<PointerType>;
+
+  static void execute_reducer(const FunctorType& f, const PolicyType& p,
+                              PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
@@ -662,11 +843,16 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
-#ifdef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
 #pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
                                                                        : f) \
-    is_device_ptr(scratch_ptr) reduction(+: result)
-#pragma omp parallel reduction(+ : result)
+    is_device_ptr(scratch_ptr) reduction(custom                             \
+                                         : result)
+#pragma omp parallel reduction(custom : result)
     {
       const int blockIdx = omp_get_team_num();
       const int gridDim  = omp_get_num_teams();
@@ -687,79 +873,27 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
         Kokkos::abort("`num_teams` clause was not respected.\n");
     }
 
-    *result_ptr = result;
-#else
-// Saving the older implementation that uses `atomic_compare_exchange` to
-// calculate the shared memory block index and `distribute` clause to distribute
-// teams.
-#pragma omp target teams distribute num_teams(nteams) thread_limit(team_size) \
-         map(to:f) map(tofrom:result) reduction(+: result) \
-    is_device_ptr(scratch_ptr, lock_array)
-    for (int i = 0; i < league_size; i++) {
-      ValueType inner_result = ValueType();
-      int shmem_block_index = -1, lock_team = 99999, iter = -1;
-      iter = (omp_get_team_num() % max_active_teams);
-
-      // Loop as long as a shmem_block_index is not found.
-      while (shmem_block_index == -1) {
-        // Try and acquire a lock on the index.
-        lock_team = atomic_compare_exchange(&lock_array[iter], 0, 1);
-
-        // If lock is acquired assign it to the block index.
-        // lock_team = 0, implies atomic_compare_exchange is successfull.
-        if (lock_team == 0)
-          shmem_block_index = iter;
-        else
-          iter = ++iter % max_active_teams;
-      }
-#pragma omp parallel num_threads(team_size) reduction(+ : inner_result)
-      {
-        typename PolicyType::member_type team(
-            i, league_size, team_size, vector_length, scratch_ptr,
-            shmem_block_index, shmem_size_L0, shmem_size_L1);
-        f(team, inner_result);
-      }
-      result = inner_result;
-
-      // Free the locked block and increment the number of available free
-      // blocks.
-      lock_team = atomic_compare_exchange(&lock_array[shmem_block_index], 1, 0);
-    }
-
-    *result_ptr = result;
-#endif
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+    // Copy results back to device if `parallel_reduce` is on a device view.
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
   }
-};
 
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, false,
-                                true> {
-  using PolicyType = TeamPolicyInternal<PolicyArgs...>;
-  template <class TagType>
-  inline static void execute_impl(const FunctorType& f, const PolicyType& p,
-                                  PointerType result_ptr) {
+  template <int NumReductions>
+  static void execute_array(const FunctorType& f, const PolicyType& p,
+                            PointerType result_ptr, bool ptr_on_device) {
     OpenMPTargetExec::verify_is_process(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
     OpenMPTargetExec::verify_initialized(
         "Kokkos::Experimental::OpenMPTarget parallel_for");
 
-#pragma omp declare reduction(                                         \
-    custom:ValueType                                                   \
-    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
-    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
-    const int league_size      = p.league_size();
-    const int team_size        = p.team_size();
-    const int vector_length    = p.impl_vector_length();
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
     const size_t shmem_size_L0 = p.scratch_size(0, team_size);
     const size_t shmem_size_L1 = p.scratch_size(1, team_size);
-    OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1);
+    OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE,
+                                     shmem_size_L0, shmem_size_L1);
     void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
 
     ValueType result = ValueType();
@@ -769,37 +903,229 @@ struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>,
     const auto nteams =
         league_size < max_active_teams ? league_size : max_active_teams;
 
+    // Case where the number of reduction items is 1.
+    if constexpr (NumReductions == 1) {
+      // Case where reduction is on a native data type.
+      if constexpr (std::is_arithmetic<ValueType>::value) {
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+: result)
+#pragma omp parallel reduction(+ : result)
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length, scratch_ptr,
+                  blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_same<TagType, void>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      } else {
+        // Case where the reduction is on a non-native data type.
+#pragma omp declare reduction(custom:ValueType : omp_out += omp_in)
 #pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
                                                                        : f) \
     is_device_ptr(scratch_ptr) reduction(custom                             \
                                          : result)
 #pragma omp parallel reduction(custom : result)
-    {
-      const int blockIdx = omp_get_team_num();
-      const int gridDim  = omp_get_num_teams();
+        {
+          const int blockIdx = omp_get_team_num();
+          const int gridDim  = omp_get_num_teams();
+
+          // Guarantee that the compilers respect the `num_teams` clause
+          if (gridDim <= nteams) {
+            for (int league_id = blockIdx; league_id < league_size;
+                 league_id += gridDim) {
+              typename PolicyType::member_type team(
+                  league_id, league_size, team_size, vector_length, scratch_ptr,
+                  blockIdx, shmem_size_L0, shmem_size_L1);
+              if constexpr (std::is_same<TagType, void>::value)
+                f(team, result);
+              else
+                f(TagType(), team, result);
+            }
+          } else
+            Kokkos::abort("`num_teams` clause was not respected.\n");
+        }
+      }
+    } else {
+      // Case where the reduction is on an array.
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions])
+#pragma omp parallel reduction(+ : result[:NumReductions])
+      {
+        const int blockIdx = omp_get_team_num();
+        const int gridDim  = omp_get_num_teams();
+
+        // Guarantee that the compilers respect the `num_teams` clause
+        if (gridDim <= nteams) {
+          for (int league_id = blockIdx; league_id < league_size;
+               league_id += gridDim) {
+            typename PolicyType::member_type team(
+                league_id, league_size, team_size, vector_length, scratch_ptr,
+                blockIdx, shmem_size_L0, shmem_size_L1);
+            if constexpr (std::is_same<TagType, void>::value)
+              f(team, result);
+            else
+              f(TagType(), team, result);
+          }
+        } else
+          Kokkos::abort("`num_teams` clause was not respected.\n");
+      }
+    }
 
-      // Guarantee that the compilers respect the `num_teams` clause
-      if (gridDim <= nteams) {
-        for (int league_id = blockIdx; league_id < league_size;
-             league_id += gridDim) {
+    // Copy results back to device if `parallel_reduce` is on a device view.
+    ParReduceCommon::memcpy_result(result_ptr, &result, sizeof(ValueType),
+                                   ptr_on_device);
+  }
+
+  // FIXME_OPENMPTARGET: This routine is copied from `parallel_reduce` over
+  // RangePolicy and needs a dedicated implementation.
+  static void execute_init_join(const FunctorType& f, const PolicyType& p,
+                                PointerType ptr, const bool ptr_on_device) {
+    const auto begin      = p.begin();
+    const auto end        = p.end();
+    constexpr int HasInit = ReduceFunctorHasInit<FunctorType>::value;
+
+    const auto size = end - begin;
+
+    const int league_size   = p.league_size();
+    const int team_size     = p.team_size();
+    const int vector_length = p.impl_vector_length();
+
+    const size_t shmem_size_L0 = p.scratch_size(0, team_size);
+    const size_t shmem_size_L1 = p.scratch_size(1, team_size);
+
+    // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are
+    // already using the available scratch memory to create temporaries for each
+    // thread.
+    // The scratch sizes are runtime values, so this must be a plain `if`.
+    if ((shmem_size_L0 + shmem_size_L1) > 0) {
+      Kokkos::abort(
+          "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` "
+          "over functors with init/join.");
+    }
+
+    // Maximum active teams possible.
+    int max_active_teams = OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size;
+    const auto nteams =
+        league_size < max_active_teams ? league_size : max_active_teams;
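+    // Leftover league iterations beyond `nteams` are strided over the launched
+    // teams inside the target region below.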
+
+    // Number of elements in the reduction
+    const auto value_count =
+        FunctorValueTraits<FunctorType, TagType>::value_count(f);
+
+    // Allocate scratch per active thread.
+    OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType));
+    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
+
+    // Take this branch if the functor provides an `init` member.
+    if constexpr (HasInit) {
+      // The `init` routine needs to be called on the device since it might need
+      // device members.
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        ValueInit::init(f, scratch_ptr);
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    } else {
+#pragma omp target map(to : f) is_device_ptr(scratch_ptr)
+      {
+        for (int i = 0; i < value_count; ++i) {
+          static_cast<ValueType*>(scratch_ptr)[i] = ValueType();
+        }
+
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+      }
+    }
+
+    if (end <= begin) {
+      // If there is no work to be done, copy back the initialized values and
+      // exit.
+      if (!ptr_on_device)
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_initial_device(), omp_get_default_device()));
+      else
+        OMPT_SAFE_CALL(omp_target_memcpy(
+            ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+            omp_get_default_device(), omp_get_default_device()));
+
+      return;
+    }
+
+#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to   \
+                                                                       : f) \
+    is_device_ptr(scratch_ptr)
+    {
+#pragma omp parallel
+      {
+        const int team_num      = omp_get_team_num();
+        const int num_teams     = omp_get_num_teams();
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr) +
+                                  team_num * team_size * value_count;
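+        // `team_scratch` is this team's private slice of the scratch buffer
+        // (stride `team_size * value_count`); `result` references its first
+        // element.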
+        ReferenceType result = ValueInit::init(f, &team_scratch[0]);
+
+        for (int league_id = team_num; league_id < league_size;
+             league_id += num_teams) {
           typename PolicyType::member_type team(
               league_id, league_size, team_size, vector_length, scratch_ptr,
-              blockIdx, shmem_size_L0, shmem_size_L1);
-          if constexpr (std::is_same<TagType, void>::value)
+              team_num, shmem_size_L0, shmem_size_L1);
+          if constexpr (std::is_same<TagType, void>::value) {
             f(team, result);
-          else
+          } else {
             f(TagType(), team, result);
+          }
         }
-      } else
-        Kokkos::abort("`num_teams` clause was not respected.\n");
-    }
-
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
+      }  // end parallel
+    }    // end target
+
+    int tree_neighbor_offset = 1;
+    do {
+#pragma omp target teams distribute parallel for simd map(to   \
+                                                          : f) \
+    is_device_ptr(scratch_ptr)
+      for (int i = 0; i < nteams - tree_neighbor_offset;
+           i += 2 * tree_neighbor_offset) {
+        ValueType* team_scratch = static_cast<ValueType*>(scratch_ptr);
+        const int team_offset   = team_size * value_count;
+        ValueJoin::join(
+            f, &team_scratch[i * team_offset],
+            &team_scratch[(i + tree_neighbor_offset) * team_offset]);
+
+        // If `final` is provided by the functor.
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value) {
+          // Call `final` only once, on the last pass of the tree reduction.
+          if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 &&
+              omp_get_thread_num() == 0)
+            FunctorFinal<FunctorType, TagType>::final(f, scratch_ptr);
+        }
+      }
+      tree_neighbor_offset *= 2;
+    } while (tree_neighbor_offset < nteams);
+
+    // Copy the reduced values back into the result view; the destination
+    // device of the memcpy depends on whether the view lives on the host or
+    // the device.
+    if (!ptr_on_device)
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_initial_device(), omp_get_default_device()));
+    else
+      OMPT_SAFE_CALL(omp_target_memcpy(
+          ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0,
+          omp_get_default_device(), omp_get_default_device()));
   }
 };
 
@@ -813,11 +1139,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
   using WorkTag = typename Policy::work_tag;
   using Member  = typename Policy::member_type;
-
-  using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
-  using ReducerTypeFwd = typename ReducerConditional::type;
+  using ReducerTypeFwd =
+      typename std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                                FunctorType, ReducerType>::type;
   using WorkTagFwd =
       std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
                          void>;
@@ -831,13 +1155,16 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   using reference_type = typename ValueTraits::reference_type;
   using value_type     = typename ValueTraits::value_type;
 
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
+  bool m_result_ptr_on_device;
+  const int m_result_ptr_num_elems;
+
+  static constexpr int HasJoin    = ReduceFunctorHasJoin<FunctorType>::value;
+  static constexpr int UseReducer = is_reducer_type<ReducerType>::value;
+  static constexpr int IsArray    = std::is_pointer<reference_type>::value;
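+  // `IsArray` is set when the reference type is a pointer, i.e. the functor
+  // reduces into an array rather than a single value.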
 
-  using ParForSpecialize =
+  using ParReduceSpecialize =
       ParallelReduceSpecialize<FunctorType, Policy, ReducerType, pointer_type,
-                               typename ValueTraits::value_type, HasJoin,
-                               UseReducer>;
+                               typename ValueTraits::value_type>;
 
   const FunctorType m_functor;
   const Policy m_policy;
@@ -846,18 +1173,50 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const int m_shmem_size;
 
  public:
-  inline void execute() const {
-    ParForSpecialize::execute(m_functor, m_policy, m_result_ptr);
+  void execute() const {
+    if constexpr (HasJoin) {
+      ParReduceSpecialize::execute_init_join(m_functor, m_policy, m_result_ptr,
+                                             m_result_ptr_on_device);
+    } else if constexpr (UseReducer) {
+      ParReduceSpecialize::execute_reducer(m_functor, m_policy, m_result_ptr,
+                                           m_result_ptr_on_device);
+    } else if constexpr (IsArray) {
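+      // Array reductions are instantiated for a few fixed capacities; dispatch
+      // to the smallest one (2, 4, 8, 16, or 32) that fits the runtime length.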
+      if (m_result_ptr_num_elems <= 2) {
+        ParReduceSpecialize::template execute_array<2>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 4) {
+        ParReduceSpecialize::template execute_array<4>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 8) {
+        ParReduceSpecialize::template execute_array<8>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 16) {
+        ParReduceSpecialize::template execute_array<16>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else if (m_result_ptr_num_elems <= 32) {
+        ParReduceSpecialize::template execute_array<32>(
+            m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+      } else {
+        Kokkos::abort("array reduction length must be <= 32");
+      }
+    } else {
+      ParReduceSpecialize::template execute_array<1>(
+          m_functor, m_policy, m_result_ptr, m_result_ptr_on_device);
+    }
   }
 
   template <class ViewType>
-  inline ParallelReduce(
+  ParallelReduce(
       const FunctorType& arg_functor, const Policy& arg_policy,
       const ViewType& arg_result,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = nullptr)
-      : m_functor(arg_functor),
+      : m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible),
+        m_result_ptr_num_elems(arg_result.size()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
@@ -865,9 +1224,14 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                      FunctorTeamShmemSize<FunctorType>::value(
                          arg_functor, arg_policy.team_size())) {}
 
-  inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
-                        const ReducerType& reducer)
-      : m_functor(arg_functor),
+  ParallelReduce(const FunctorType& arg_functor, Policy& arg_policy,
+                 const ReducerType& reducer)
+      : m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
+        m_result_ptr_num_elems(reducer.view().size()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
@@ -889,11 +1253,11 @@ struct TeamThreadRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const iType end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline TeamThreadRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, iType count)
+  TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  iType count)
       : start(0), end(count), team(thread_) {}
-  inline TeamThreadRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, iType begin_, iType end_)
+  TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  iType begin_, iType end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -904,12 +1268,11 @@ struct ThreadVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const index_type end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline ThreadVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type count)
+  ThreadVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                    index_type count)
       : start(0), end(count), team(thread_) {}
-  inline ThreadVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
-      index_type end_)
+  ThreadVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                    index_type begin_, index_type end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -920,12 +1283,11 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
   const index_type end;
   const OpenMPTargetExecTeamMember& team;
 
-  inline TeamVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type count)
+  TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  index_type count)
       : start(0), end(count), team(thread_) {}
-  inline TeamVectorRangeBoundariesStruct(
-      const OpenMPTargetExecTeamMember& thread_, index_type begin_,
-      index_type end_)
+  TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& thread_,
+                                  index_type begin_, index_type end_)
       : start(begin_), end(end_), team(thread_) {}
 };
 
@@ -935,5 +1297,4 @@ struct TeamVectorRangeBoundariesStruct<iType, OpenMPTargetExecTeamMember> {
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#undef KOKKOS_IMPL_LOCK_FREE_HIERARCHICAL
 #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
index 3dfad2bb856e0bb65a48dfd70b3458cee4c9beb5..40d8c45f5d0f8ce6798079d9eb3deb48a4361122 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp
@@ -91,7 +91,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
 #pragma omp target teams distribute map(to : functor) num_teams(end - begin)
     {
-      for (ptrdiff_t tile_idx = begin; tile_idx < end; tile_idx++) {
+      for (ptrdiff_t tile_idx = begin; tile_idx < end; ++tile_idx) {
 
 #pragma omp parallel
         {
@@ -116,31 +116,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #endif
   }
 
-  template <int Rank>
-  inline typename std::enable_if<Rank == 1>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const auto begin_0 = policy.m_lower[0];
-
-    const auto end_0 = policy.m_upper[0];
-
-#pragma omp target teams distribute parallel for map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      functor(i0);
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-#pragma omp for
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      functor(i0);
-    }
-#endif
-  }
-
   template <int Rank>
   inline typename std::enable_if<Rank == 2>::type execute_tile(
       typename Policy::point_type offset, const FunctorType& functor,
@@ -154,8 +129,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_1 = policy.m_upper[1];
 
 #pragma omp target teams distribute parallel for collapse(2) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
         if constexpr (std::is_same<typename Policy::work_tag, void>::value)
           functor(i0, i1);
         else
@@ -172,8 +147,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
 
 #pragma omp for collapse(2)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1) {
         if constexpr (std::is_same<typename Policy::work_tag, void>::value)
           functor(i0, i1);
         else
@@ -197,9 +172,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_2 = policy.m_upper[2];
 
 #pragma omp target teams distribute parallel for collapse(3) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
           if constexpr (std::is_same<typename Policy::work_tag, void>::value)
             functor(i0, i1, i2);
           else
@@ -221,9 +196,9 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
 
 #pragma omp for collapse(3)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2) {
           if constexpr (std::is_same<typename Policy::work_tag, void>::value)
             functor(i0, i1, i2);
           else
@@ -249,10 +224,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_3 = policy.m_upper[3];
 
 #pragma omp target teams distribute parallel for collapse(4) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
             if constexpr (std::is_same<typename Policy::work_tag, void>::value)
               functor(i0, i1, i2, i3);
             else
@@ -279,10 +254,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
 
 #pragma omp for collapse(4)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3) {
             if constexpr (std::is_same<typename Policy::work_tag, void>::value)
               functor(i0, i1, i2, i3);
             else
@@ -310,11 +285,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_4 = policy.m_upper[4];
 
 #pragma omp target teams distribute parallel for collapse(5) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
-            for (auto i4 = begin_4; i4 < end_4; i4++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
+            for (auto i4 = begin_4; i4 < end_4; ++i4) {
               if constexpr (std::is_same<typename Policy::work_tag,
                                          void>::value)
                 functor(i0, i1, i2, i3, i4);
@@ -347,11 +322,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
 
 #pragma omp for collapse(5)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3)
+            for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4) {
               if constexpr (std::is_same<typename Policy::work_tag,
                                          void>::value)
                 functor(i0, i1, i2, i3, i4);
@@ -382,12 +357,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     const auto end_5 = policy.m_upper[5];
 
 #pragma omp target teams distribute parallel for collapse(6) map(to : functor)
-    for (auto i0 = begin_0; i0 < end_0; i0++) {
-      for (auto i1 = begin_1; i1 < end_1; i1++) {
-        for (auto i2 = begin_2; i2 < end_2; i2++) {
-          for (auto i3 = begin_3; i3 < end_3; i3++) {
-            for (auto i4 = begin_4; i4 < end_4; i4++) {
-              for (auto i5 = begin_5; i5 < end_5; i5++) {
+    for (auto i0 = begin_0; i0 < end_0; ++i0) {
+      for (auto i1 = begin_1; i1 < end_1; ++i1) {
+        for (auto i2 = begin_2; i2 < end_2; ++i2) {
+          for (auto i3 = begin_3; i3 < end_3; ++i3) {
+            for (auto i4 = begin_4; i4 < end_4; ++i4) {
+              for (auto i5 = begin_5; i5 < end_5; ++i5) {
                 {
                   if constexpr (std::is_same<typename Policy::work_tag,
                                              void>::value)
@@ -428,12 +403,12 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
     end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
 
 #pragma omp for collapse(6)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
+    for (ptrdiff_t i0 = begin_0; i0 < end_0; ++i0)
+      for (ptrdiff_t i1 = begin_1; i1 < end_1; ++i1)
+        for (ptrdiff_t i2 = begin_2; i2 < end_2; ++i2)
+          for (ptrdiff_t i3 = begin_3; i3 < end_3; ++i3)
+            for (ptrdiff_t i4 = begin_4; i4 < end_4; ++i4)
+              for (ptrdiff_t i5 = begin_5; i5 < end_5; ++i5) {
                 if constexpr (std::is_same<typename Policy::work_tag,
                                            void>::value)
                   functor(i0, i1, i2, i3, i4, i5);
@@ -443,195 +418,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 #endif
   }
 
-  template <int Rank>
-  inline typename std::enable_if<Rank == 7>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const int begin_0 = policy.m_lower[0];
-    const int begin_1 = policy.m_lower[1];
-    const int begin_2 = policy.m_lower[2];
-    const int begin_3 = policy.m_lower[3];
-    const int begin_4 = policy.m_lower[4];
-    const int begin_5 = policy.m_lower[5];
-    const int begin_6 = policy.m_lower[6];
-
-    const int end_0 = policy.m_upper[0];
-    const int end_1 = policy.m_upper[1];
-    const int end_2 = policy.m_upper[2];
-    const int end_3 = policy.m_upper[3];
-    const int end_4 = policy.m_upper[4];
-    const int end_5 = policy.m_upper[5];
-    const int end_6 = policy.m_upper[6];
-
-#pragma omp target teams distribute parallel for collapse(7) map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  if constexpr (std::is_same<typename Policy::work_tag,
-                                             void>::value)
-                    functor(i0, i1, i2, i3, i4, i5, i6);
-                  else
-                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
-                            i6);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-
-    const ptrdiff_t begin_1 = offset[1];
-    ptrdiff_t end_1         = begin_1 + policy.m_tile[1];
-    end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
-
-    const ptrdiff_t begin_2 = offset[2];
-    ptrdiff_t end_2         = begin_2 + policy.m_tile[2];
-    end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
-
-    const ptrdiff_t begin_3 = offset[3];
-    ptrdiff_t end_3         = begin_3 + policy.m_tile[3];
-    end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
-
-    const ptrdiff_t begin_4 = offset[4];
-    ptrdiff_t end_4         = begin_4 + policy.m_tile[4];
-    end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
-
-    const ptrdiff_t begin_5 = offset[5];
-    ptrdiff_t end_5         = begin_5 + policy.m_tile[5];
-    end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
-
-    const ptrdiff_t begin_6 = offset[6];
-    ptrdiff_t end_6         = begin_6 + policy.m_tile[6];
-    end_6 = end_6 < policy.m_upper[6] ? end_6 : policy.m_upper[6];
-
-#pragma omp for collapse(7)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  if constexpr (std::is_same<typename Policy::work_tag,
-                                             void>::value)
-                    functor(i0, i1, i2, i3, i4, i5, i6);
-                  else
-                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
-                            i6);
-                }
-#endif
-  }
-
-  template <int Rank>
-  inline typename std::enable_if<Rank == 8>::type execute_tile(
-      typename Policy::point_type offset, const FunctorType& functor,
-      const Policy& policy) const {
-#ifdef KOKKOS_IMPL_MDRANGE_USE_NO_TILES
-    (void)offset;
-    const int begin_0 = policy.m_lower[0];
-    const int begin_1 = policy.m_lower[1];
-    const int begin_2 = policy.m_lower[2];
-    const int begin_3 = policy.m_lower[3];
-    const int begin_4 = policy.m_lower[4];
-    const int begin_5 = policy.m_lower[5];
-    const int begin_6 = policy.m_lower[6];
-    const int begin_7 = policy.m_lower[7];
-
-    const int end_0 = policy.m_upper[0];
-    const int end_1 = policy.m_upper[1];
-    const int end_2 = policy.m_upper[2];
-    const int end_3 = policy.m_upper[3];
-    const int end_4 = policy.m_upper[4];
-    const int end_5 = policy.m_upper[5];
-    const int end_6 = policy.m_upper[6];
-    const int end_7 = policy.m_upper[7];
-
-#pragma omp target teams distribute parallel for collapse(8) map(to : functor)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++) {
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++) {
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++) {
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++) {
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++) {
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++) {
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++) {
-                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
-                    if constexpr (std::is_same<typename Policy::work_tag,
-                                               void>::value)
-                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
-                    else
-                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
-                              i5, i6, i7);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-#else
-    const ptrdiff_t begin_0 = offset[0];
-    ptrdiff_t end_0         = begin_0 + policy.m_tile[0];
-    end_0 = end_0 < policy.m_upper[0] ? end_0 : policy.m_upper[0];
-
-    const ptrdiff_t begin_1 = offset[1];
-    ptrdiff_t end_1         = begin_1 + policy.m_tile[1];
-    end_1 = end_1 < policy.m_upper[1] ? end_1 : policy.m_upper[1];
-
-    const ptrdiff_t begin_2 = offset[2];
-    ptrdiff_t end_2         = begin_2 + policy.m_tile[2];
-    end_2 = end_2 < policy.m_upper[2] ? end_2 : policy.m_upper[2];
-
-    const ptrdiff_t begin_3 = offset[3];
-    ptrdiff_t end_3         = begin_3 + policy.m_tile[3];
-    end_3 = end_3 < policy.m_upper[3] ? end_3 : policy.m_upper[3];
-
-    const ptrdiff_t begin_4 = offset[4];
-    ptrdiff_t end_4         = begin_4 + policy.m_tile[4];
-    end_4 = end_4 < policy.m_upper[4] ? end_4 : policy.m_upper[4];
-
-    const ptrdiff_t begin_5 = offset[5];
-    ptrdiff_t end_5         = begin_5 + policy.m_tile[5];
-    end_5 = end_5 < policy.m_upper[5] ? end_5 : policy.m_upper[5];
-
-    const ptrdiff_t begin_6 = offset[6];
-    ptrdiff_t end_6         = begin_6 + policy.m_tile[6];
-    end_6 = end_6 < policy.m_upper[6] ? end_6 : policy.m_upper[6];
-
-    const ptrdiff_t begin_7 = offset[7];
-    ptrdiff_t end_7         = begin_7 + policy.m_tile[7];
-    end_7 = end_7 < policy.m_upper[7] ? end_7 : policy.m_upper[7];
-
-#pragma omp for collapse(8)
-    for (ptrdiff_t i0 = begin_0; i0 < end_0; i0++)
-      for (ptrdiff_t i1 = begin_1; i1 < end_1; i1++)
-        for (ptrdiff_t i2 = begin_2; i2 < end_2; i2++)
-          for (ptrdiff_t i3 = begin_3; i3 < end_3; i3++)
-            for (ptrdiff_t i4 = begin_4; i4 < end_4; i4++)
-              for (ptrdiff_t i5 = begin_5; i5 < end_5; i5++)
-                for (ptrdiff_t i6 = begin_6; i6 < end_6; i6++)
-                  for (ptrdiff_t i7 = begin_7; i7 < end_7; i7++) {
-                    if constexpr (std::is_same<typename Policy::work_tag,
-                                               void>::value)
-                      functor(i0, i1, i2, i3, i4, i5, i6, i7);
-                    else
-                      functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
-                              i5, i6, i7);
-                  }
-#endif
-  }
-
   inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy)
       : m_functor(arg_functor), m_policy(arg_policy) {}
   // TODO DZP: based on a conversation with Christian, we're using 256 as a
@@ -652,112 +438,6 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 namespace Kokkos {
 namespace Impl {
 
-template <class FunctorType, class ReducerType, class PointerType,
-          class ValueType, class... PolicyArgs>
-struct ParallelReduceSpecialize<FunctorType,
-                                Kokkos::MDRangePolicy<PolicyArgs...>,
-                                ReducerType, PointerType, ValueType, 0, 0> {
-  using PolicyType = Kokkos::RangePolicy<PolicyArgs...>;
-  template <class TagType>
-  inline static
-      typename std::enable_if<std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
-
-    ValueType result = ValueType();
-#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
-    for (int i = begin; i < end; i++) f(i, result);
-
-    *result_ptr = result;
-  }
-
-  template <class TagType>
-  inline static
-      typename std::enable_if<!std::is_same<TagType, void>::value>::type
-      execute_impl(const FunctorType& f, const PolicyType& p,
-                   PointerType result_ptr) {
-    OpenMPTargetExec::verify_is_process(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    OpenMPTargetExec::verify_initialized(
-        "Kokkos::Experimental::OpenMPTarget parallel_for");
-    const typename PolicyType::member_type begin = p.begin();
-    const typename PolicyType::member_type end   = p.end();
-
-    ValueType result = ValueType();
-#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
-    for (int i = begin; i < end; i++) f(TagType(), i, result);
-
-    *result_ptr = result;
-  }
-
-  inline static void execute(const FunctorType& f, const PolicyType& p,
-                             PointerType ptr) {
-    execute_impl<typename PolicyType::work_tag>(f, p, ptr);
-  }
-};
-/*
-template<class FunctorType, class PolicyType, class ReducerType, class
-PointerType, class ValueType> struct ParallelReduceSpecialize<FunctorType,
-PolicyType, ReducerType, PointerType, ValueType, 0,1> {
-
-  #pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out,
-omp_in)) initializer ( ReducerType::init(omp_priv) )
-
-  template< class TagType >
-  inline static
-  typename std::enable_if< std::is_same< TagType , void >::value >::type
-  execute_impl(const FunctorType& f, const PolicyType& p, PointerType
-result_ptr)
-    {
-      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget
-parallel_for");
-      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget
-parallel_for"); const typename PolicyType::member_type begin = p.begin(); const
-typename PolicyType::member_type end = p.end();
-
-      ValueType result = ValueType();
-      #pragma omp target teams distribute parallel for num_teams(512) map(to:f)
-map(tofrom:result) reduction(custom: result) for(int i=begin; i<end; i++)
-        f(i,result);
-
-      *result_ptr=result;
-    }
-
-
-  template< class TagType >
-  inline static
-  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  execute_impl(const FunctorType& f, const PolicyType& p, PointerType
-result_ptr)
-    {
-      OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget
-parallel_for");
-      OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget
-parallel_for"); const typename PolicyType::member_type begin = p.begin(); const
-typename PolicyType::member_type end = p.end();
-
-      ValueType result = ValueType();
-      #pragma omp target teams distribute parallel for num_teams(512) map(to:f)
-map(tofrom: result) reduction(custom: result) for(int i=begin; i<end; i++)
-        f(TagType(),i,result);
-
-      *result_ptr=result;
-    }
-
-
-    inline static
-    void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
-      execute_impl<typename PolicyType::work_tag>(f,p,ptr);
-    }
-};
-
-
 template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                      Kokkos::Experimental::OpenMPTarget> {
@@ -765,42 +445,38 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   using Policy = Kokkos::MDRangePolicy<Traits...>;
 
   using WorkTag = typename Policy::work_tag;
-  using WorkRange = typename Policy::WorkRange;
-  using Member = typename Policy::member_type;
+  using Member  = typename Policy::member_type;
 
   using ReducerConditional =
-      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                         FunctorType, ReducerType>;
+      std::conditional<std::is_same<InvalidType, ReducerType>::value,
+                       FunctorType, ReducerType>;
   using ReducerTypeFwd = typename ReducerConditional::type;
   using WorkTagFwd =
-      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                               WorkTag, void>::type;
-
-  // Static Assert WorkTag void if ReducerType not InvalidType
+      std::conditional_t<std::is_same<InvalidType, ReducerType>::value, WorkTag,
+                         void>;
 
   using ValueTraits =
       Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
-  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
-  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
-
-  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
-  enum { UseReducer = is_reducer_type<ReducerType>::value };
 
-  using pointer_type = typename ValueTraits::pointer_type;
+  using pointer_type   = typename ValueTraits::pointer_type;
   using reference_type = typename ValueTraits::reference_type;
 
-  using ParForSpecialize = ParallelReduceSpecialize<
-      FunctorType, Policy, ReducerType, pointer_type,
-      typename ValueTraits::value_type, HasJoin, UseReducer>;
+  enum { HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
+  enum { UseReducer = is_reducer_type<ReducerType>::value };
 
+  const pointer_type m_result_ptr;
   const FunctorType m_functor;
   const Policy m_policy;
   const ReducerType m_reducer;
-  const pointer_type m_result_ptr;
+
+  using ParReduceCommon = ParallelReduceCommon<pointer_type>;
+
+  bool m_result_ptr_on_device;
 
  public:
   inline void execute() const {
-    ParForSpecialize::execute(m_functor, m_policy, m_result_ptr);
+    execute_tile<Policy::rank, typename ValueTraits::value_type>(
+        m_functor, m_policy, m_result_ptr);
   }
 
   template <class ViewType>
@@ -810,35 +486,345 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
       typename std::enable_if<Kokkos::is_view<ViewType>::value &&
                                   !Kokkos::is_reducer_type<ReducerType>::value,
                               void*>::type = NULL)
-      : m_functor(arg_functor),
+      : m_result_ptr(arg_result_view.data()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(InvalidType()),
-        m_result_ptr(arg_result_view.data()) {
-    //static_assert( std::is_same< typename ViewType::memory_space
-    //                                , Kokkos::HostSpace >::value
-    //  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a
-    //  Kokkos::View in HostSpace" );
-  }
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ViewType::memory_space>::accessible) {}
 
   inline ParallelReduce(const FunctorType& arg_functor, Policy arg_policy,
                         const ReducerType& reducer)
-      : m_functor(arg_functor),
+      : m_result_ptr(reducer.view().data()),
+        m_functor(arg_functor),
         m_policy(arg_policy),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {
-    //static_assert( std::is_same< typename ViewType::memory_space
-    //                                , Kokkos::HostSpace >::value
-    //  , "Reduction result on Kokkos::Experimental::OpenMPTarget must be a
-    //  Kokkos::View in HostSpace" );
+        m_result_ptr_on_device(
+            MemorySpaceAccess<Kokkos::Experimental::OpenMPTargetSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 2>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
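+      // `OpenMPTargetReducerWrapper` exposes the reducer's join/init to OpenMP
+      // so the `custom` reduction follows the Kokkos reducer semantics.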
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(2) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, result);
+          else
+            functor(typename Policy::work_tag(), i0, i1, result);
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+            functor(i0, i1, result);
+          else
+            functor(typename Policy::work_tag(), i0, i1, result);
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
   }
-  // TODO DZP: based on a conversation with Christian, we're using 256 as a
-heuristic
-  // here. We need something better once we can query these kinds of properties
-  template<typename Policy, typename Functor>
-static int max_tile_size_product(const Policy&, const Functor&) {
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 3>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(3) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, result);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, result);
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            if constexpr (std::is_same<typename Policy::work_tag, void>::value)
+              functor(i0, i1, i2, result);
+            else
+              functor(typename Policy::work_tag(), i0, i1, i2, result);
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 4>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(4) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, result);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, result);
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              if constexpr (std::is_same<typename Policy::work_tag,
+                                         void>::value)
+                functor(i0, i1, i2, i3, result);
+              else
+                functor(typename Policy::work_tag(), i0, i1, i2, i3, result);
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 5>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+    const auto begin_4 = policy.m_lower[4];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+    const auto end_4 = policy.m_upper[4];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(5) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                if constexpr (std::is_same<typename Policy::work_tag,
+                                           void>::value)
+                  functor(i0, i1, i2, i3, i4, result);
+                else
+                  functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                          result);
+              }
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                if constexpr (std::is_same<typename Policy::work_tag,
+                                           void>::value)
+                  functor(i0, i1, i2, i3, i4, result);
+                else
+                  functor(typename Policy::work_tag(), i0, i1, i2, i3, i4,
+                          result);
+              }
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <int Rank, class ValueType>
+  inline typename std::enable_if<Rank == 6>::type execute_tile(
+      const FunctorType& functor, const Policy& policy,
+      pointer_type ptr) const {
+    const auto begin_0 = policy.m_lower[0];
+    const auto begin_1 = policy.m_lower[1];
+    const auto begin_2 = policy.m_lower[2];
+    const auto begin_3 = policy.m_lower[3];
+    const auto begin_4 = policy.m_lower[4];
+    const auto begin_5 = policy.m_lower[5];
+
+    const auto end_0 = policy.m_upper[0];
+    const auto end_1 = policy.m_upper[1];
+    const auto end_2 = policy.m_upper[2];
+    const auto end_3 = policy.m_upper[3];
+    const auto end_4 = policy.m_upper[4];
+    const auto end_5 = policy.m_upper[5];
+
+    ValueType result = ValueType();
+
+    // FIXME_OPENMPTARGET: Unable to separate directives and their companion
+    // loops which leads to code duplication for different reduction types.
+    if constexpr (UseReducer) {
+#pragma omp declare reduction(                                         \
+    custom:ValueType                                                   \
+    : OpenMPTargetReducerWrapper <ReducerType>::join(omp_out, omp_in)) \
+    initializer(OpenMPTargetReducerWrapper <ReducerType>::init(omp_priv))
+
+#pragma omp target teams distribute parallel for collapse(6) map(to         \
+                                                                 : functor) \
+    reduction(custom                                                        \
+              : result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                for (auto i5 = begin_5; i5 < end_5; ++i5) {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, result);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            result);
+                }
+              }
+            }
+          }
+        }
+      }
+    } else {
+#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \
+reduction(+:result)
+      for (auto i0 = begin_0; i0 < end_0; ++i0) {
+        for (auto i1 = begin_1; i1 < end_1; ++i1) {
+          for (auto i2 = begin_2; i2 < end_2; ++i2) {
+            for (auto i3 = begin_3; i3 < end_3; ++i3) {
+              for (auto i4 = begin_4; i4 < end_4; ++i4) {
+                for (auto i5 = begin_5; i5 < end_5; ++i5) {
+                  if constexpr (std::is_same<typename Policy::work_tag,
+                                             void>::value)
+                    functor(i0, i1, i2, i3, i4, i5, result);
+                  else
+                    functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5,
+                            result);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    ParReduceCommon::memcpy_result(ptr, &result, sizeof(ValueType),
+                                   m_result_ptr_on_device);
+  }
+
+  template <typename Policy, typename Functor>
+  static int max_tile_size_product(const Policy&, const Functor&) {
     return 256;
   }
-};*/
+};
 
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
index be924ffa61c1f8cf696b3b84cb44765536fde4f9..0e71a239caf343d77f6ed05ff02bb2e45ca64efd 100644
--- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
@@ -112,35 +112,11 @@ void TaskExec<Kokkos::Experimental::OpenMPTarget>::team_barrier_impl() const {
   // This team member sets one byte within the sync variable
   int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank;
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
-
   *sync_self = int8_t(m_sync_value & 0x03);  // signal arrival
 
   while (m_sync_value != *sync)
     ;  // wait for team to arrive
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
-
   ++m_sync_step;
 
   if (0 == (0x01 & m_sync_step)) {  // Every other step
@@ -222,17 +198,6 @@ void TaskQueueSpecialization<Kokkos::Experimental::OpenMPTarget>::execute(
         task = *task_shared;
       }
 
-#if 0
-fprintf( stdout
-       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
-       , team_exec.m_group_rank
-       , team_exec.m_team_rank
-       , uintptr_t(task_shared)
-       , uintptr_t(task)
-       );
-fflush(stdout);
-#endif
-
       if (0 == task) break;  // 0 == m_ready_count
 
       if (end == task) {
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
index 3a09ee919540b93c74bcd5f2e7eea57b352575a7..18d33317a29819274037085b6a69e9239797f1a8 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp
@@ -112,14 +112,36 @@ void SYCL::print_configuration(std::ostream& s, const bool detailed) {
 }
 
 void SYCL::fence() const {
-  Impl::SYCLInternal::fence(*m_space_instance->m_queue);
+  fence("Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence");
+}
+void SYCL::fence(const std::string& name) const {
+  Impl::SYCLInternal::fence(*m_space_instance->m_queue, name,
+                            impl_instance_id());
 }
 
 void SYCL::impl_static_fence() {
-  // guard accessing all_queues
-  std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex);
-  for (auto& queue : Impl::SYCLInternal::all_queues)
-    Impl::SYCLInternal::fence(**queue);
+  impl_static_fence(
+      "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence");
+}
+void SYCL::impl_static_fence(const std::string& name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::SYCL>(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      [&]() {
+        // guard accessing all_queues
+        std::lock_guard<std::mutex> lock(Impl::SYCLInternal::mutex);
+        for (auto& queue : Impl::SYCLInternal::all_queues) {
+          try {
+            (*queue)->wait_and_throw();
+          } catch (sycl::exception const& e) {
+            Kokkos::Impl::throw_runtime_exception(
+                std::string("There was a synchronous SYCL error:\n") +=
+                e.what());
+          }
+        }
+      });
 }
 
 int SYCL::sycl_device() const {
@@ -224,10 +246,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os,
             << device.get_info<device::global_mem_cache_size>()
             << "\nGlobal Mem Size: "
             << device.get_info<device::global_mem_size>()
-            << "\nMax Constant Buffer Size: "
-            << device.get_info<device::max_constant_buffer_size>()
-            << "\nMax Constant Args: "
-            << device.get_info<device::max_constant_args>()
             << "\nLocal Mem Size: " << device.get_info<device::local_mem_size>()
             << "\nError Correction Support: "
             << device.get_info<device::error_correction_support>()
@@ -296,6 +314,9 @@ void SYCLSpaceInitializer::finalize(const bool all_spaces) {
 void SYCLSpaceInitializer::fence() {
   Kokkos::Experimental::SYCL::impl_static_fence();
 }
+void SYCLSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Experimental::SYCL::impl_static_fence(name);
+}
 
 void SYCLSpaceInitializer::print_configuration(std::ostream& msg,
                                                const bool detail) {
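With the overloads added above, both the instance fence and the global fence carry a label that is forwarded to attached profiling tools. A minimal usage sketch, assuming a SYCL-enabled build and that the labeled global Kokkos::fence overload is available in this Kokkos version (the label strings are illustrative):

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        Kokkos::Experimental::SYCL exec;
        // Instance fence: waits on this instance's queue; the label shows up
        // in profiling/tracing output.
        exec.fence("example: fence after kernel submission");
        // Global fence: waits on all SYCL queues known to Kokkos.
        Kokkos::fence("example: global fence");
      }
      Kokkos::finalize();
    }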
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
index aef65ee7ecbbf3c39432b42a42b595dbfe00b239..3eeab5636342031955920c81e807b218d662f3b8 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp
@@ -48,181 +48,144 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_SYCL.hpp>
 
+#include <vector>
+
 #ifdef KOKKOS_ENABLE_SYCL
 
 namespace Kokkos {
 namespace Impl {
 
-template <>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                Kokkos::Experimental::SYCL> {
-  DeepCopy(void* dst, const void* src, size_t);
-  DeepCopy(const Kokkos::Experimental::SYCL&, void* dst, const void* src,
-           size_t);
-};
-
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                Kokkos::Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
+template <class DT, class... DP>
+struct ZeroMemset<Kokkos::Experimental::SYCL, DT, DP...> {
+  ZeroMemset(const Kokkos::Experimental::SYCL& exec_space,
+             const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    auto event = exec_space.impl_internal_space_instance()->m_queue->memset(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type));
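+    // The barrier orders later submissions to this queue after the memset
+    // without blocking the host.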
+    exec_space.impl_internal_space_instance()->m_queue->submit_barrier(
+        std::vector<sycl::event>{event});
   }
 
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+  ZeroMemset(const View<DT, DP...>& dst,
+             typename View<DT, DP...>::const_value_type&) {
+    Experimental::Impl::SYCLInternal::singleton().m_queue->memset(
+        dst.data(), 0,
+        dst.size() * sizeof(typename View<DT, DP...>::value_type));
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
-  }
+void DeepCopySYCL(void* dst, const void* src, size_t n);
+void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
+                       const void* src, size_t n);
+void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n);
 
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+template <class MemSpace>
+struct DeepCopy<MemSpace, HostSpace, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
   }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                ExecutionSpace> {
-  DeepCopy(void* dst, const void* src, size_t n) {
-    (void)DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-                   Kokkos::Experimental::SYCL>(dst, src, n);
-  }
-
-  DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) {
-    exec.fence();
-    DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-             Kokkos::Experimental::SYCL>(Kokkos::Experimental::SYCL(), dst, src,
-                                         n);
-    Kokkos::Experimental::SYCL().fence();
+template <class MemSpace>
+struct DeepCopy<HostSpace, MemSpace, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
   }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
+template <class MemSpace1, class MemSpace2>
+struct DeepCopy<MemSpace1, MemSpace2, Kokkos::Experimental::SYCL,
+                std::enable_if_t<is_sycl_type_space<MemSpace1>::value &&
+                                 is_sycl_type_space<MemSpace2>::value>> {
+  DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); }
+  DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
+           const void* src, size_t n) {
+    DeepCopyAsyncSYCL(instance, dst, src, n);
+  }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace,
-                Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+template <class MemSpace1, class MemSpace2, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace1, MemSpace2, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace1>::value &&
+        is_sycl_type_space<MemSpace2>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
 
-template <>
-struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace,
-                Kokkos::Experimental::SYCL>
-    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
 
-template <>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace1::name() + "Space, " +
+        MemSpace2::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <>
-struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                Experimental::SYCLSharedUSMSpace, Kokkos::Experimental::SYCL>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace,
-                      Kokkos::Experimental::SYCL> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace,
-                 Kokkos::Experimental::SYCL>::DeepCopy;
-};
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    MemSpace, HostSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
-};
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLDeviceUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<") + MemSpace::name() +
+        "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace,
-                Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                      Experimental::SYCLDeviceUSMSpace, ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace,
-                 Experimental::SYCLDeviceUSMSpace, ExecutionSpace>::DeepCopy;
-};
+template <class MemSpace, class ExecutionSpace>
+struct DeepCopy<
+    HostSpace, MemSpace, ExecutionSpace,
+    std::enable_if_t<
+        is_sycl_type_space<MemSpace>::value &&
+        !std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value>> {
+  inline DeepCopy(void* dst, const void* src, size_t n) {
+    DeepCopySYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<Experimental::SYCLSharedUSMSpace, HostSpace, ExecutionSpace>
-    : public DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                      ExecutionSpace> {
-  using DeepCopy<Experimental::SYCLDeviceUSMSpace, HostSpace,
-                 ExecutionSpace>::DeepCopy;
-};
+  inline DeepCopy(const ExecutionSpace& exec, void* dst, const void* src,
+                  size_t n) {
+    exec.fence(fence_string());
+    DeepCopyAsyncSYCL(dst, src, n);
+  }
 
-template <class ExecutionSpace>
-struct DeepCopy<HostSpace, Experimental::SYCLSharedUSMSpace, ExecutionSpace>
-    : public DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                      ExecutionSpace> {
-  using DeepCopy<HostSpace, Experimental::SYCLDeviceUSMSpace,
-                 ExecutionSpace>::DeepCopy;
+ private:
+  static const std::string& fence_string() {
+    static const std::string string =
+        std::string("Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
+        "Space, ExecutionSpace>::DeepCopy: fence before copy";
+    return string;
+  }
 };
 
 }  // namespace Impl
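The rewrite above replaces one specialization per USM-space pair with a handful of partial specializations constrained on is_sycl_type_space, all funneling into DeepCopySYCL / DeepCopyAsyncSYCL. A standalone sketch of that enable_if-constrained specialization pattern, using hypothetical trait and space names rather than the Kokkos ones:

    #include <cstdio>
    #include <type_traits>

    struct DeviceUSM {};  // stand-in for a device USM space
    struct SharedUSM {};  // stand-in for a shared USM space
    struct Host {};       // stand-in for HostSpace

    template <class S>
    struct is_usm_space : std::false_type {};
    template <>
    struct is_usm_space<DeviceUSM> : std::true_type {};
    template <>
    struct is_usm_space<SharedUSM> : std::true_type {};

    // Primary template: generic fallback.
    template <class Dst, class Src, class Enable = void>
    struct Copy {
      Copy() { std::puts("generic copy"); }
    };

    // A single partial specialization covers every (USM dst, Host src) pair.
    template <class Dst>
    struct Copy<Dst, Host, std::enable_if_t<is_usm_space<Dst>::value>> {
      Copy() { std::puts("host-to-USM copy"); }
    };

    int main() {
      Copy<DeviceUSM, Host>{};  // host-to-USM copy
      Copy<SharedUSM, Host>{};  // host-to-USM copy (same specialization)
      Copy<Host, Host>{};       // generic copy
    }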
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
index 5a702b5027277cc7137cba9bba72e7367e9ae97b..816b42038ed0bb1605d005375bd39d2de4e3d69d 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp
@@ -42,14 +42,7 @@
 //@HEADER
 */
 
-#include <Kokkos_Concepts.hpp>
-#include <SYCL/Kokkos_SYCL_Instance.hpp>
-#include <KokkosCore_Config_DeclareBackend.hpp>
-#include <Kokkos_SYCL.hpp>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_Serial.hpp>
-#include <impl/Kokkos_ConcurrentBitset.hpp>
-#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Core.hpp>  //kokkos_malloc
 
 namespace Kokkos {
 namespace Experimental {
@@ -122,7 +115,6 @@ void SYCLInternal::initialize(const sycl::queue& q) {
       all_queues.push_back(&m_queue);
     }
     const sycl::device& d = m_queue->get_device();
-    std::cout << SYCL::SYCLDevice(d) << '\n';
 
     m_maxWorkgroupSize =
         d.template get_info<sycl::info::device::max_work_group_size>();
@@ -140,19 +132,22 @@ void SYCLInternal::initialize(const sycl::queue& q) {
           Kokkos::Experimental::SYCLDeviceUSMSpace, void>;
       Record* const r =
           Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                           "Kokkos::SYCL::InternalScratchBitset",
+                           "Kokkos::Experimental::SYCL::InternalScratchBitset",
                            sizeof(uint32_t) * buffer_bound);
       Record::increment(r);
       m_scratchConcurrentBitset = reinterpret_cast<uint32_t*>(r->data());
       auto event                = m_queue->memset(m_scratchConcurrentBitset, 0,
                                    sizeof(uint32_t) * buffer_bound);
-      fence(event);
+      fence(event,
+            "Kokkos::Experimental::SYCLInternal::initialize: fence after "
+            "initializing m_scratchConcurrentBitset",
+            m_instance_id);
     }
 
     m_maxShmemPerBlock =
         d.template get_info<sycl::info::device::local_mem_size>();
-    m_indirectKernelMem.reset(*m_queue);
-    m_indirectReducerMem.reset(*m_queue);
+    m_indirectKernelMem.reset(*m_queue, m_instance_id);
+    m_indirectReducerMem.reset(*m_queue, m_instance_id);
   } else {
     std::ostringstream msg;
     msg << "Kokkos::Experimental::SYCL::initialize(...) FAILED";
@@ -162,10 +157,36 @@ void SYCLInternal::initialize(const sycl::queue& q) {
     }
     Kokkos::Impl::throw_runtime_exception(msg.str());
   }
+
+  m_team_scratch_current_size = 0;
+  m_team_scratch_ptr          = nullptr;
+}
+
+void* SYCLInternal::resize_team_scratch_space(std::int64_t bytes,
+                                              bool force_shrink) {
+  if (m_team_scratch_current_size == 0) {
+    m_team_scratch_current_size = bytes;
+    m_team_scratch_ptr =
+        Kokkos::kokkos_malloc<Experimental::SYCLDeviceUSMSpace>(
+            "Kokkos::Experimental::SYCLDeviceUSMSpace::TeamScratchMemory",
+            m_team_scratch_current_size);
+  }
+  if ((bytes > m_team_scratch_current_size) ||
+      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
+    m_team_scratch_current_size = bytes;
+    m_team_scratch_ptr =
+        Kokkos::kokkos_realloc<Experimental::SYCLDeviceUSMSpace>(
+            m_team_scratch_ptr, m_team_scratch_current_size);
+  }
+  return m_team_scratch_ptr;
 }
 
+uint32_t SYCLInternal::impl_get_instance_id() const { return m_instance_id; }
+
 void SYCLInternal::finalize() {
-  SYCL().fence();
+  SYCLInternal::fence(*m_queue,
+                      "Kokkos::SYCLInternal::finalize: fence on finalization",
+                      m_instance_id);
   was_finalized = true;
 
   using RecordSYCL = Kokkos::Impl::SharedAllocationRecord<SYCLDeviceUSMSpace>;
@@ -182,6 +203,12 @@ void SYCLInternal::finalize() {
   RecordSYCL::decrement(RecordSYCL::get_record(m_scratchConcurrentBitset));
   m_scratchConcurrentBitset = nullptr;
 
+  if (m_team_scratch_current_size > 0)
+    Kokkos::kokkos_free<Kokkos::Experimental::SYCLDeviceUSMSpace>(
+        m_team_scratch_ptr);
+  m_team_scratch_current_size = 0;
+  m_team_scratch_ptr          = nullptr;
+
   m_indirectKernelMem.reset();
   m_indirectReducerMem.reset();
   // guard erasing from all_queues
@@ -208,7 +235,7 @@ void* SYCLInternal::scratch_space(
 
     Record* const r =
         Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::SYCL::InternalScratchSpace",
+                         "Kokkos::Experimental::SYCL::InternalScratchSpace",
                          (sizeScratchGrain * m_scratchSpaceCount));
 
     Record::increment(r);
@@ -235,7 +262,7 @@ void* SYCLInternal::scratch_flags(
 
     Record* const r =
         Record::allocate(Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue),
-                         "Kokkos::SYCL::InternalScratchFlags",
+                         "Kokkos::Experimental::SYCL::InternalScratchFlags",
                          (sizeScratchGrain * m_scratchFlagsCount));
 
     Record::increment(r);
@@ -243,14 +270,38 @@ void* SYCLInternal::scratch_flags(
     m_scratchFlags = reinterpret_cast<size_type*>(r->data());
   }
   m_queue->memset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain);
-  fence(*m_queue);
+  fence(*m_queue,
+        "Kokkos::Experimental::SYCLInternal::scratch_flags fence after "
+        "initializing m_scratchFlags",
+        m_instance_id);
 
   return m_scratchFlags;
 }
 
+template <typename WAT>
+void SYCLInternal::fence_helper(WAT& wat, const std::string& name,
+                                uint32_t instance_id) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event<
+      Kokkos::Experimental::SYCL>(
+      name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id},
+      [&]() {
+        try {
+          wat.wait_and_throw();
+        } catch (sycl::exception const& e) {
+          Kokkos::Impl::throw_runtime_exception(
+              std::string("There was a synchronous SYCL error:\n") += e.what());
+        }
+      });
+}
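+// Explicit instantiations for the two waitable types the backend fences on;
+// this keeps the template definition out of the header.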
+template void SYCLInternal::fence_helper<sycl::queue>(sycl::queue&,
+                                                      const std::string&,
+                                                      uint32_t);
+template void SYCLInternal::fence_helper<sycl::event>(sycl::event&,
+                                                      const std::string&,
+                                                      uint32_t);
+
 template <sycl::usm::alloc Kind>
 size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
-  assert(m_size == 0);
   assert(m_q);
 
   if (m_capacity < n) {
@@ -258,8 +309,8 @@ size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
     // First free what we have (in case malloc can reuse it)
     if (m_data) Record::decrement(Record::get_record(m_data));
 
-    Record* const r = Record::allocate(AllocationSpace(*m_q),
-                                       "Kokkos::SYCL::USMObjectMem", n);
+    Record* const r = Record::allocate(
+        AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n);
     Record::increment(r);
 
     m_data     = r->data();
@@ -271,9 +322,9 @@ size_t SYCLInternal::USMObjectMem<Kind>::reserve(size_t n) {
 
 template <sycl::usm::alloc Kind>
 void SYCLInternal::USMObjectMem<Kind>::reset() {
-  assert(m_size == 0);
-
   if (m_data) {
+    // This implies a fence since this class is not copyable
+    // and deallocating implies a fence across all registered queues.
     using Record = Kokkos::Impl::SharedAllocationRecord<AllocationSpace, void>;
     Record::decrement(Record::get_record(m_data));
 
@@ -285,6 +336,7 @@ void SYCLInternal::USMObjectMem<Kind>::reset() {
 
 template class SYCLInternal::USMObjectMem<sycl::usm::alloc::shared>;
 template class SYCLInternal::USMObjectMem<sycl::usm::alloc::device>;
+template class SYCLInternal::USMObjectMem<sycl::usm::alloc::host>;
 
 }  // namespace Impl
 }  // namespace Experimental
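fence_helper is now declared in the header but defined in this translation unit, with explicit instantiations for sycl::queue and sycl::event so only those two types can be fenced on. A minimal standalone sketch of that declaration/definition split (the file layout and names below are hypothetical):

    // logger.hpp -- only the declaration is visible to callers.
    #include <iostream>
    #include <string>

    struct Logger {
      template <typename T>
      static void log(const T& value, const std::string& label);
    };

    // logger.cpp -- out-of-line definition plus explicit instantiations;
    // calling Logger::log with any other type would fail at link time.
    template <typename T>
    void Logger::log(const T& value, const std::string& label) {
      std::cout << label << ": " << value << '\n';
    }
    template void Logger::log<int>(const int&, const std::string&);
    template void Logger::log<double>(const double&, const std::string&);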
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
index e797411cd40bdd734c04d2a9b0e51151fa269ebd..bf4d6c5b459579213866f6dcb99332c6e641c3a1 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp
@@ -49,7 +49,7 @@
 #include <CL/sycl.hpp>
 
 #include <impl/Kokkos_Error.hpp>
-
+#include <impl/Kokkos_Profiling.hpp>
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
@@ -68,7 +68,10 @@ class SYCLInternal {
 
   void* scratch_space(const size_type size);
   void* scratch_flags(const size_type size);
+  void* resize_team_scratch_space(std::int64_t bytes,
+                                  bool force_shrink = false);
 
+  uint32_t impl_get_instance_id() const;
   int m_syclDev = -1;
 
   size_t m_maxWorkgroupSize   = 0;
@@ -81,6 +84,11 @@ class SYCLInternal {
   size_type m_scratchFlagsCount       = 0;
   size_type* m_scratchFlags           = nullptr;
 
+  int64_t m_team_scratch_current_size = 0;
+  void* m_team_scratch_ptr            = nullptr;
+
+  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
+      Kokkos::Experimental::SYCL>(reinterpret_cast<uintptr_t>(this));
   std::optional<sycl::queue> m_queue;
 
   // Using std::vector<std::optional<sycl::queue>> reveals a compiler bug when
@@ -94,40 +102,16 @@ class SYCLInternal {
   template <sycl::usm::alloc Kind>
   class USMObjectMem {
    public:
-    class Deleter {
-     public:
-      Deleter() = default;
-      explicit Deleter(USMObjectMem* mem) : m_mem(mem) {}
-
-      template <typename T>
-      void operator()(T* p) const noexcept {
-        assert(m_mem);
-        assert(sizeof(T) == m_mem->size());
-
-        if constexpr (sycl::usm::alloc::device == kind)
-          // Only skipping the dtor on trivially copyable types
-          static_assert(std::is_trivially_copyable_v<T>);
-        else
-          p->~T();
-
-        m_mem->m_size = 0;
-      }
-
-     private:
-      USMObjectMem* m_mem = nullptr;
-    };
-
-    static constexpr sycl::usm::alloc kind = Kind;
-
     void reset();
 
-    void reset(sycl::queue q) {
+    void reset(sycl::queue q, uint32_t instance_id) {
+      m_instance_id = instance_id;
       reset();
       m_q.emplace(std::move(q));
     }
-
     USMObjectMem() = default;
-    explicit USMObjectMem(sycl::queue q) noexcept : m_q(std::move(q)) {}
+    explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept
+        : m_q(std::move(q)), m_instance_id(instance_id) {}
 
     USMObjectMem(USMObjectMem const&) = delete;
     USMObjectMem(USMObjectMem&&)      = delete;
@@ -139,7 +123,6 @@ class SYCLInternal {
     void* data() noexcept { return m_data; }
     const void* data() const noexcept { return m_data; }
 
-    size_t size() const noexcept { return m_size; }
     size_t capacity() const noexcept { return m_capacity; }
 
     // reserve() allocates space for at least n bytes
@@ -147,120 +130,68 @@ class SYCLInternal {
     size_t reserve(size_t n);
 
    private:
-    using AllocationSpace =
-        std::conditional_t<Kind == sycl::usm::alloc::device,
-                           Kokkos::Experimental::SYCLDeviceUSMSpace,
-                           Kokkos::Experimental::SYCLSharedUSMSpace>;
-
-    // This will memcpy an object T into memory held by this object
-    // returns: a T* to that object
-    //
-    // Note:  it is UB to dereference this pointer with an object that is
-    // not an implicit-lifetime nor trivially-copyable type, but presumably much
-    // faster because we can use USM device memory
-    template <typename T>
-    std::unique_ptr<T, Deleter> memcpy_from(const T& t) {
-      reserve(sizeof(T));
-      sycl::event memcopied = m_q->memcpy(m_data, std::addressof(t), sizeof(T));
-      fence(memcopied);
-
-      std::unique_ptr<T, Deleter> ptr(reinterpret_cast<T*>(m_data),
-                                      Deleter(this));
-      m_size = sizeof(T);
-      return ptr;
-    }
-
-    // This will copy-constuct an object T into memory held by this object
-    // returns: a unique_ptr<T, destruct_delete> that will call the
-    // destructor on the type when it goes out of scope.
-    //
-    // Note:  This will not work with USM device memory
-    template <typename T>
-    std::unique_ptr<T, Deleter> copy_construct_from(const T& t) {
-      static_assert(kind != sycl::usm::alloc::device,
-                    "Cannot copy construct into USM device memory");
-
-      reserve(sizeof(T));
-
-      std::unique_ptr<T, Deleter> ptr(new (m_data) T(t), Deleter(this));
-      m_size = sizeof(T);
-      return ptr;
-    }
+    using AllocationSpace = std::conditional_t<
+        Kind == sycl::usm::alloc::device,
+        Kokkos::Experimental::SYCLDeviceUSMSpace,
+        std::conditional_t<Kind == sycl::usm::alloc::shared,
+                           Kokkos::Experimental::SYCLSharedUSMSpace,
+                           Kokkos::Experimental::SYCLHostUSMSpace>>;
 
    public:
-    // Performs either memcpy (for USM device memory) and returns a T*
-    // (but is technically UB when dereferenced on an object that is not
-    // an implicit-lifetime nor trivially-copyable type
-    //
-    // or
-    //
-    // performs copy construction (for other USM memory types) and returns a
-    // unique_ptr<T, ...>
-    template <typename T>
-    std::unique_ptr<T, Deleter> copy_from(const T& t) {
-      if constexpr (sycl::usm::alloc::device == kind)
-        return memcpy_from(t);
-      else
-        return copy_construct_from(t);
-    }
-
-   private:
-    // Returns a reference to t (helpful when debugging)
+    // Performs either a sycl::queue::memcpy (for USM device memory) or a
+    // std::memcpy (otherwise) and returns a reference to the copied object.
     template <typename T>
-    T& memcpy_to(T& t) {
-      assert(sizeof(T) == m_size);
-
-      sycl::event memcopied = m_q->memcpy(std::addressof(t), m_data, sizeof(T));
-      fence(memcopied);
-
-      return t;
+    T& copy_from(const T& t) {
+      fence();
+      reserve(sizeof(T));
+      if constexpr (sycl::usm::alloc::device == Kind) {
+        sycl::event memcopied =
+            m_q->memcpy(m_data, std::addressof(t), sizeof(T));
+        SYCLInternal::fence(
+            memcopied,
+            "Kokkos::Experimental::SYCLInternal::USMObject fence after copy",
+            m_instance_id);
+      } else
+        std::memcpy(m_data, std::addressof(t), sizeof(T));
+      return *reinterpret_cast<T*>(m_data);
     }
 
-    // Returns a reference to t (helpful when debugging)
-    template <typename T>
-    T& move_assign_to(T& t) {
-      static_assert(kind != sycl::usm::alloc::device,
-                    "Cannot move_assign_to from USM device memory");
-
-      assert(sizeof(T) == m_size);
-
-      t = std::move(*static_cast<T*>(m_data));
-
-      return t;
+    void fence() {
+      SYCLInternal::fence(
+          m_last_event,
+          "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for "
+          "last event to finish",
+          m_instance_id);
     }
 
-   public:
-    // Returns a reference to t (helpful when debugging)
-    template <typename T>
-    T& transfer_to(T& t) {
-      if constexpr (sycl::usm::alloc::device == kind)
-        return memcpy_to(t);
-      else
-        return move_assign_to(t);
+    void register_event(sycl::event event) {
+      assert(m_last_event
+                 .get_info<sycl::info::event::command_execution_status>() ==
+             sycl::info::event_command_status::complete);
+      m_last_event = event;
     }
 
    private:
     // USMObjectMem class invariants
-    // All four expressions below must evaluate to true:
+    // Both expressions below must evaluate to true:
     //
-    //  !m_data == !m_capacity
-    //  m_q || !m_data
-    //  m_data || !m_size
-    //  m_size <= m_capacity
+    //  !m_data == (m_capacity == 0)
+    //      m_q || !m_data
     //
     //  The above invariants mean that:
-    //  if m_size != 0 then m_data != 0
-    //  if m_data != 0 then m_capacity != 0 && m_q != nullopt
-    //  if m_data == 0 then m_capacity == 0
+    //  if m_data != nullptr then m_capacity != 0 && m_q != nullopt
+    //  if m_data == nullptr then m_capacity == 0
 
     std::optional<sycl::queue> m_q;
     void* m_data      = nullptr;
-    size_t m_size     = 0;  // sizeof(T) iff m_data points to live T
     size_t m_capacity = 0;
+    sycl::event m_last_event;
+
+    uint32_t m_instance_id;
   };
 
   // An indirect kernel is one where the functor to be executed is explicitly
-  // copied to USM device memory before being executed, to get around the
+  // copied to USM memory before being executed, to get around the
   // trivially copyable limitation of SYCL.
   using IndirectKernelMem = USMObjectMem<sycl::usm::alloc::shared>;
   IndirectKernelMem m_indirectKernelMem;
@@ -286,18 +217,18 @@ class SYCLInternal {
   // fence(...) takes any type with a .wait_and_throw() method
   // (sycl::event and sycl::queue)
   template <typename WAT>
-  static void fence_helper(WAT& wat) {
-    try {
-      wat.wait_and_throw();
-    } catch (sycl::exception const& e) {
-      Kokkos::Impl::throw_runtime_exception(
-          std::string("There was a synchronous SYCL error:\n") += e.what());
-    }
-  }
+  static void fence_helper(WAT& wat, const std::string& name,
+                           uint32_t instance_id);
 
  public:
-  static void fence(sycl::queue& q) { fence_helper(q); }
-  static void fence(sycl::event& e) { fence_helper(e); }
+  static void fence(sycl::queue& q, const std::string& name,
+                    uint32_t instance_id) {
+    fence_helper(q, name, instance_id);
+  }
+  static void fence(sycl::event& e, const std::string& name,
+                    uint32_t instance_id) {
+    fence_helper(e, name, instance_id);
+  }
 };
 
 template <typename Functor, typename Storage,
@@ -312,20 +243,24 @@ class SYCLFunctionWrapper<Functor, Storage, true> {
   SYCLFunctionWrapper(const Functor& functor, Storage&) : m_functor(functor) {}
 
   const Functor& get_functor() const { return m_functor; }
+
+  static void register_event(Storage&, sycl::event) {}
 };
 
 template <typename Functor, typename Storage>
 class SYCLFunctionWrapper<Functor, Storage, false> {
-  std::unique_ptr<Functor,
-                  Experimental::Impl::SYCLInternal::IndirectKernelMem::Deleter>
-      m_kernelFunctorPtr;
+  const Functor& m_kernelFunctor;
 
  public:
   SYCLFunctionWrapper(const Functor& functor, Storage& storage)
-      : m_kernelFunctorPtr(storage.copy_from(functor)) {}
+      : m_kernelFunctor(storage.copy_from(functor)) {}
 
   std::reference_wrapper<const Functor> get_functor() const {
-    return {*m_kernelFunctorPtr};
+    return {m_kernelFunctor};
+  }
+
+  static void register_event(Storage& storage, sycl::event event) {
+    storage.register_event(event);
   }
 };
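The USMObjectMem / SYCLFunctionWrapper pair above implements the "indirect kernel" idea: a functor that is not trivially copyable is staged in USM once, and the kernel captures only a trivially copyable pointer to it. A conceptual sketch in plain SYCL with hypothetical names, not the wrapper's actual code path:

    #include <CL/sycl.hpp>
    #include <new>

    // Not trivially copyable (user-provided copy constructor), so it cannot
    // be captured by value in a SYCL kernel directly.
    struct Scale {
      double factor;
      explicit Scale(double f) : factor(f) {}
      Scale(const Scale& other) : factor(other.factor) {}
      void operator()(double* x, size_t i) const { x[i] *= factor; }
    };

    int main() {
      sycl::queue q;
      constexpr size_t n = 16;
      double* data = sycl::malloc_shared<double>(n, q);
      for (size_t i = 0; i < n; ++i) data[i] = 1.0;

      // Stage the functor in USM shared memory ("indirect" storage) ...
      Scale* staged = sycl::malloc_shared<Scale>(1, q);
      new (staged) Scale(2.0);  // construct the functor in the staged slot

      // ... and capture only the pointer, which is trivially copyable.
      q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
         (*staged)(data, i[0]);
       }).wait();

      staged->~Scale();
      sycl::free(staged, q);
      sycl::free(data, q);
    }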
 
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
index a286169c45988339dce1b14c6d6a4ffde25dcea5..dca73683c3d1f06157affec3e8fe00feb7d36fd0 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Range.hpp
@@ -47,11 +47,13 @@
 
 #include <impl/KokkosExp_IterateTileGPU.hpp>
 
-template <class FunctorType, class ExecPolicy>
-class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
+#include <vector>
+
+template <class FunctorType, class... Traits>
+class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
                                 Kokkos::Experimental::SYCL> {
  public:
-  using Policy = ExecPolicy;
+  using Policy = Kokkos::RangePolicy<Traits...>;
 
  private:
   using Member       = typename Policy::member_type;
@@ -62,16 +64,15 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
   const Policy m_policy;
 
   template <typename Functor>
-  static void sycl_direct_launch(const Policy& policy, const Functor& functor) {
+  static sycl::event sycl_direct_launch(const Policy& policy,
+                                        const Functor& functor) {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    space.fence();
-
-    q.submit([functor, policy](sycl::handler& cgh) {
+    auto parallel_for_event = q.submit([functor, policy](sycl::handler& cgh) {
       sycl::range<1> range(policy.end() - policy.begin());
       const auto begin = policy.begin();
 
@@ -83,8 +84,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
           functor(WorkTag(), id);
       });
     });
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
 
-    space.fence();
+    return parallel_for_event;
   }
 
  public:
@@ -100,7 +102,9 @@ class Kokkos::Impl::ParallelFor<FunctorType, ExecPolicy,
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(const ParallelFor&) = delete;
@@ -201,41 +205,48 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
 
   template <typename Functor>
-  void sycl_direct_launch(const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Functor& functor) const {
     // Convenience references
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *m_space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    m_space.fence();
-
-    if (m_policy.m_num_tiles == 0) return;
+    if (m_policy.m_num_tiles == 0) return {};
 
     const BarePolicy bare_policy(m_policy);
 
-    q.submit([functor, this, bare_policy](sycl::handler& cgh) {
-      const auto range = compute_ranges();
-
-      cgh.parallel_for(range, [functor, bare_policy](sycl::nd_item<3> item) {
-        const index_type local_x    = item.get_local_id(0);
-        const index_type local_y    = item.get_local_id(1);
-        const index_type local_z    = item.get_local_id(2);
-        const index_type global_x   = item.get_group(0);
-        const index_type global_y   = item.get_group(1);
-        const index_type global_z   = item.get_group(2);
-        const index_type n_global_x = item.get_group_range(0);
-        const index_type n_global_y = item.get_group_range(1);
-        const index_type n_global_z = item.get_group_range(2);
-
-        Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor,
-                                        typename Policy::work_tag>(
-            bare_policy, functor, {n_global_x, n_global_y, n_global_z},
-            {global_x, global_y, global_z}, {local_x, local_y, local_z})
-            .exec_range();
-      });
-    });
-
-    m_space.fence();
+    auto parallel_for_event =
+        q.submit([functor, this, bare_policy](sycl::handler& cgh) {
+          const auto range                  = compute_ranges();
+          const sycl::range<3> global_range = range.get_global_range();
+          const sycl::range<3> local_range  = range.get_local_range();
+          const sycl::nd_range sycl_swapped_range{
+              sycl::range<3>{global_range[2], global_range[1], global_range[0]},
+              sycl::range<3>{local_range[2], local_range[1], local_range[0]}};
+
+          cgh.parallel_for(sycl_swapped_range, [functor, bare_policy](
+                                                   sycl::nd_item<3> item) {
+            // swap back for correct index calculations in DeviceIterateTile
+            const index_type local_x    = item.get_local_id(2);
+            const index_type local_y    = item.get_local_id(1);
+            const index_type local_z    = item.get_local_id(0);
+            const index_type global_x   = item.get_group(2);
+            const index_type global_y   = item.get_group(1);
+            const index_type global_z   = item.get_group(0);
+            const index_type n_global_x = item.get_group_range(2);
+            const index_type n_global_y = item.get_group_range(1);
+            const index_type n_global_z = item.get_group_range(0);
+
+            Kokkos::Impl::DeviceIterateTile<Policy::rank, BarePolicy, Functor,
+                                            typename Policy::work_tag>(
+                bare_policy, functor, {n_global_x, n_global_y, n_global_z},
+                {global_x, global_y, global_z}, {local_x, local_y, local_z})
+                .exec_range();
+          });
+        });
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
+
+    return parallel_for_event;
   }
 
  public:
@@ -253,7 +264,8 @@ class Kokkos::Impl::ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
-    sycl_direct_launch(functor_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(const ParallelFor&) = delete;
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
index 03b7753f8e81ef5045b16cedd4206d85174c0033..75237b4c72a4dbfc1b7ebe201dda240128f62ced 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Reduce.hpp
@@ -46,14 +46,99 @@
 #define KOKKOS_SYCL_PARALLEL_REDUCE_HPP
 
 #include <Kokkos_Macros.hpp>
+
+#include <vector>
 #if defined(KOKKOS_ENABLE_SYCL)
+#include <Kokkos_Parallel_Reduce.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
+namespace SYCLReduction {
+template <class ValueJoin, class ValueOps, typename WorkTag, typename ValueType,
+          typename ReducerType, typename FunctorType, int dim>
+void workgroup_reduction(sycl::nd_item<dim>& item,
+                         sycl::local_ptr<ValueType> local_mem,
+                         ValueType* results_ptr,
+                         ValueType* device_accessible_result_ptr,
+                         const unsigned int value_count,
+                         const ReducerType& selected_reducer,
+                         const FunctorType& functor, bool final) {
+  const auto local_id = item.get_local_linear_id();
+  // FIXME_SYCL should be item.get_group().get_local_linear_range();
+  size_t wgroup_size = 1;
+  for (unsigned int i = 0; i < dim; ++i) wgroup_size *= item.get_local_range(i);
+
+  // Perform the actual workgroup reduction in each subgroup
+  // separately.
+  auto sg                = item.get_sub_group();
+  auto* result           = &local_mem[local_id * value_count];
+  const auto id_in_sg    = sg.get_local_id()[0];
+  const auto local_range = std::min(sg.get_local_range()[0], wgroup_size);
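+  // Strided combine within the subgroup: in every step each work-item folds
+  // in the value `stride` slots to its right, so after the loop the first
+  // item of the subgroup holds that subgroup's partial result.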
+  for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+    if (id_in_sg + stride < local_range)
+      ValueJoin::join(selected_reducer, result,
+                      &local_mem[(local_id + stride) * value_count]);
+    sg.barrier();
+  }
+  item.barrier(sycl::access::fence_space::local_space);
+
+  // Copy the subgroup results into the first positions of the
+  // reduction array.
+  if (id_in_sg == 0)
+    ValueOps::copy(functor, &local_mem[sg.get_group_id()[0] * value_count],
+                   result);
+  item.barrier(sycl::access::fence_space::local_space);
+
+  // Do the final reduction only using the first subgroup.
+  if (sg.get_group_id()[0] == 0) {
+    const auto n_subgroups = sg.get_group_range()[0];
+    auto* result_          = &local_mem[id_in_sg * value_count];
+    // In case the number of subgroups is larger than the range of
+    // the first subgroup, we first combine the items with a higher
+    // index.
+    for (unsigned int offset = local_range; offset < n_subgroups;
+         offset += local_range)
+      if (id_in_sg + offset < n_subgroups)
+        ValueJoin::join(selected_reducer, result_,
+                        &local_mem[(id_in_sg + offset) * value_count]);
+    sg.barrier();
+
+    // Then, we proceed as before.
+    for (unsigned int stride = 1; stride < local_range; stride <<= 1) {
+      if (id_in_sg + stride < n_subgroups)
+        ValueJoin::join(selected_reducer, result_,
+                        &local_mem[(id_in_sg + stride) * value_count]);
+      sg.barrier();
+    }
+
+    // Finally, we copy the workgroup results back to global memory
+    // to be used in the next iteration. If this is the last
+    // iteration, i.e., there is only one workgroup, we also call
+    // final() if necessary.
+    if (id_in_sg == 0) {
+      if (final) {
+        if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
+          FunctorFinal<FunctorType, WorkTag>::final(functor, &local_mem[0]);
+        if (device_accessible_result_ptr != nullptr)
+          ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                         &local_mem[0]);
+        else
+          ValueOps::copy(functor, &results_ptr[0], &local_mem[0]);
+      } else
+        ValueOps::copy(functor,
+                       &results_ptr[(item.get_group_linear_id()) * value_count],
+                       &local_mem[0]);
+    }
+  }
+}
+
+}  // namespace SYCLReduction
+
 template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                      Kokkos::Experimental::SYCL> {
@@ -76,19 +161,29 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   ParallelReduce(
       const FunctorType& f, const Policy& p, const V& v,
       typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
-      : m_functor(f), m_policy(p), m_result_ptr(v.data()) {}
+      : m_functor(f),
+        m_policy(p),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename V::memory_space>::accessible) {}
 
   ParallelReduce(const FunctorType& f, const Policy& p,
                  const ReducerType& reducer)
       : m_functor(f),
         m_policy(p),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
 
  private:
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -121,18 +216,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const unsigned int value_count =
         FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
             selected_reducer);
-    // FIXME_SYCL only use the first half
     const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
-    // FIXME_SYCL without this we are running into a race condition
-    const auto results_ptr2 =
-        results_ptr + std::max(value_count, 1u) * init_size;
+        sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         const auto begin = policy.begin();
         cgh.single_task([=]() {
           const auto& selected_reducer = ReducerConditional::select(
@@ -149,9 +244,13 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
           if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
             FunctorFinal<FunctorType, WorkTag>::final(
                 static_cast<const FunctorType&>(functor), results_ptr);
+          if (device_accessible_result_ptr != nullptr)
+            ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                           &results_ptr[0]);
         });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -163,7 +262,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       auto n_wgroups = ((size + values_per_thread - 1) / values_per_thread +
                         wgroup_size - 1) /
                        wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -217,49 +316,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
               }
               item.barrier(sycl::access::fence_space::local_space);
 
-              // Perform the actual workgroup reduction. To achieve a better
-              // memory access pattern, we use sequential addressing and a
-              // reversed loop. If the workgroup size is 8, the first element
-              // contains all the values with index%4==0, after the second one
-              // the values with index%2==0 and after the third one index%1==0,
-              // i.e., all values.
-              for (unsigned int stride = wgroup_size / 2; stride > 0;
-                   stride >>= 1) {
-                const auto idx = local_id;
-                if (idx < stride) {
-                  ValueJoin::join(selected_reducer,
-                                  &local_mem[idx * value_count],
-                                  &local_mem[(idx + stride) * value_count]);
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-              }
-
-              // Finally, we copy the workgroup results back to global memory to
-              // be used in the next iteration. If this is the last iteration,
-              // i.e., there is only one workgroup also call final() if
-              // necessary.
-              if (local_id == 0) {
-                ValueOps::copy(
-                    functor,
-                    &results_ptr2[(item.get_group_linear_id()) * value_count],
-                    &local_mem[0]);
-                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-                  if (n_wgroups <= 1)
-                    FunctorFinal<FunctorType, WorkTag>::final(
-                        static_cast<const FunctorType&>(functor),
-                        &results_ptr2[(item.get_group_linear_id()) *
-                                      value_count]);
-              }
+              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+                  item, local_mem.get_pointer(), results_ptr,
+                  device_accessible_result_ptr, value_count, selected_reducer,
+                  static_cast<const FunctorType&>(functor), n_wgroups <= 1);
             });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
 
-      // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      space.fence();
+      last_reduction_event = parallel_reduce_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -268,13 +333,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence due to "
+          "inaccessible reducer result location");
     }
+
+    return last_reduction_event;
   }
 
  public:
@@ -291,15 +360,18 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
-  FunctorType m_functor;
-  Policy m_policy;
-  ReducerType m_reducer;
-  pointer_type m_result_ptr;
+  const FunctorType m_functor;
+  const Policy m_policy;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
 };
 
 template <class FunctorType, class ReducerType, class... Traits>
@@ -347,7 +419,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
   ParallelReduce(
       const FunctorType& f, const Policy& p, const V& v,
       typename std::enable_if<Kokkos::is_view<V>::value, void*>::type = nullptr)
-      : m_functor(f), m_policy(p), m_space(p.space()), m_result_ptr(v.data()) {}
+      : m_functor(f),
+        m_policy(p),
+        m_space(p.space()),
+        m_result_ptr(v.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename V::memory_space>::accessible) {}
 
   ParallelReduce(const FunctorType& f, const Policy& p,
                  const ReducerType& reducer)
@@ -355,12 +433,17 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         m_policy(p),
         m_space(p.space()),
         m_reducer(reducer),
-        m_result_ptr(reducer.view().data()) {}
+        m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible) {}
 
  private:
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -379,8 +462,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
         *m_space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    const int nwork = m_policy.m_num_tiles;
-    const int block_size =
+    const typename Policy::index_type nwork = m_policy.m_num_tiles;
+    const typename Policy::index_type block_size =
         std::pow(2, std::ceil(std::log2(m_policy.m_prod_tile_dims)));
 
     const sycl::range<1> local_range(block_size);
@@ -402,12 +485,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // FIXME_SYCL without this we are running into a race condition
     const auto results_ptr2 =
         results_ptr + std::max(value_count, 1u) * init_size;
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         cgh.single_task([=]() {
           const auto& selected_reducer = ReducerConditional::select(
               static_cast<const FunctorType&>(functor),
@@ -424,9 +511,13 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
             FunctorFinal<FunctorType, WorkTag>::final(
                 static_cast<const FunctorType&>(functor), results_ptr);
+          if (device_accessible_result_ptr)
+            ValueOps::copy(functor, &device_accessible_result_ptr[0],
+                           &results_ptr[0]);
         });
       });
-      m_space.fence();
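+      // Instead of fencing the whole device on the host, enqueue a barrier so
+      // that later submissions to this queue wait on the kernel's event.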
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -435,8 +526,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // value.
     bool first_run = true;
     while (size > 1) {
-      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto n_wgroups             = (size + wgroup_size - 1) / wgroup_size;
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -498,47 +589,21 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
           }
           item.barrier(sycl::access::fence_space::local_space);
 
-          // Perform the actual workgroup reduction. To achieve a better
-          // memory access pattern, we use sequential addressing and a
-          // reversed loop. If the workgroup size is 8, the first element
-          // contains all the values with index%4==0, after the second one
-          // the values with index%2==0 and after the third one index%1==0,
-          // i.e., all values.
-          for (unsigned int stride = wgroup_size / 2; stride > 0;
-               stride >>= 1) {
-            const auto idx = local_id;
-            if (idx < stride) {
-              ValueJoin::join(selected_reducer, &local_mem[idx * value_count],
-                              &local_mem[(idx + stride) * value_count]);
-            }
-            item.barrier(sycl::access::fence_space::local_space);
-          }
-
-          // Finally, we copy the workgroup results back to global memory to
-          // be used in the next iteration. If this is the last iteration,
-          // i.e., there is only one workgroup also call final() if
-          // necessary.
-          if (local_id == 0) {
-            ValueOps::copy(
-                functor,
-                &results_ptr2[(item.get_group_linear_id()) * value_count],
-                &local_mem[0]);
-            if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-              if (n_wgroups <= 1)
-                FunctorFinal<FunctorType, WorkTag>::final(
-                    static_cast<const FunctorType&>(functor),
-                    &results_ptr2[(item.get_group_linear_id()) * value_count]);
-          }
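+          // The shared-memory tree reduction, the final() call for the last
+          // workgroup, and the optional copy to a device-accessible result
+          // pointer are factored into SYCLReduction::workgroup_reduction,
+          // which is shared with the TeamPolicy backend.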
+          SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+              item, local_mem.get_pointer(), results_ptr2,
+              device_accessible_result_ptr, value_count, selected_reducer,
+              static_cast<const FunctorType&>(functor),
+              n_wgroups <= 1 && item.get_group_linear_id() == 0);
         });
       });
-      m_space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
 
       // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          m_space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      m_space.fence();
+      auto deep_copy_event =
+          q.memcpy(results_ptr, results_ptr2,
+                   sizeof(*m_result_ptr) * value_count * n_wgroups);
+      q.submit_barrier(std::vector<sycl::event>{deep_copy_event});
+      last_reduction_event = deep_copy_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -547,19 +612,23 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           m_space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      m_space.fence();
+      m_space.fence(
+          "Kokkos::Impl::ParallelReduce::sycl_direct_launch: fence after deep "
+          "copying results back");
     }
+
+    return last_reduction_event;
   }
 
  public:
   template <typename Policy, typename Functor>
   static int max_tile_size_product(const Policy& policy, const Functor&) {
-    return policy.space().impl_internal_space_instance()->m_maxThreadsPerSM;
+    return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize;
   }
 
   void execute() const {
@@ -575,16 +644,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
-  FunctorType m_functor;
-  BarePolicy m_policy;
+  const FunctorType m_functor;
+  const BarePolicy m_policy;
   const Kokkos::Experimental::SYCL& m_space;
-  ReducerType m_reducer;
-  pointer_type m_result_ptr;
+  const ReducerType m_reducer;
+  const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
 };
 
 }  // namespace Impl
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
index 5eac6bf9da62b29b9d15697bc5061c00db504e0c..d5611c2159bbc4bf0bd6a29fb89a941f7560650a 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Scan.hpp
@@ -47,6 +47,7 @@
 
 #include <Kokkos_Macros.hpp>
 #include <memory>
+#include <vector>
 #if defined(KOKKOS_ENABLE_SYCL)
 
 namespace Kokkos {
@@ -86,96 +87,99 @@ class ParallelScanSYCLBase {
   void scan_internal(sycl::queue& q, const Functor& functor,
                      pointer_type global_mem, std::size_t size) const {
     // FIXME_SYCL optimize
-    constexpr size_t wgroup_size = 32;
+    constexpr size_t wgroup_size = 128;
     auto n_wgroups               = (size + wgroup_size - 1) / wgroup_size;
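+    // The group results live in the same scratch allocation, directly behind
+    // the n_wgroups * wgroup_size entries used at this level of the scan (see
+    // impl_execute for how the total size is computed).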
+    pointer_type group_results   = global_mem + n_wgroups * wgroup_size;
 
-    // FIXME_SYCL The allocation should be handled by the execution space
-    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
-    std::unique_ptr<value_type[], decltype(deleter)> group_results_memory(
-        static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * n_wgroups,
-                                               q, sycl::usm::alloc::shared)),
-        deleter);
-    auto group_results = group_results_memory.get();
-
-    q.submit([&](sycl::handler& cgh) {
+    auto local_scans = q.submit([&](sycl::handler& cgh) {
       sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                      sycl::access::target::local>
           local_mem(sycl::range<1>(wgroup_size), cgh);
 
-      // FIXME_SYCL we get wrong results without this, not sure why
-      sycl::stream out(1, 1, cgh);
       cgh.parallel_for(
           sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
           [=](sycl::nd_item<1> item) {
-            const auto local_id  = item.get_local_linear_id();
-            const auto global_id = item.get_global_linear_id();
+            const auto local_id      = item.get_local_linear_id();
+            const auto global_id     = item.get_global_linear_id();
+            const auto global_offset = global_id - local_id;
 
             // Initialize local memory
             if (global_id < size)
-              ValueOps::copy(functor, &local_mem[local_id],
-                             &global_mem[global_id]);
+              local_mem[local_id] = global_mem[global_id];
             else
               ValueInit::init(functor, &local_mem[local_id]);
             item.barrier(sycl::access::fence_space::local_space);
 
-            // Perform workgroup reduction
-            for (size_t stride = 1; 2 * stride < wgroup_size + 1; stride *= 2) {
-              auto idx = 2 * stride * (local_id + 1) - 1;
-              if (idx < wgroup_size)
-                ValueJoin::join(functor, &local_mem[idx],
-                                &local_mem[idx - stride]);
-              item.barrier(sycl::access::fence_space::local_space);
+            // subgroup scans
+            auto sg                = item.get_sub_group();
+            const auto sg_group_id = sg.get_group_id()[0];
+            const int id_in_sg     = sg.get_local_id()[0];
+            for (int stride = wgroup_size / 2; stride > 0; stride >>= 1) {
+              auto tmp = sg.shuffle_up(local_mem[local_id], stride);
+              if (id_in_sg >= stride)
+                ValueJoin::join(functor, &local_mem[local_id], &tmp);
             }
 
-            if (local_id == 0) {
-              if (n_wgroups > 1)
-                ValueOps::copy(functor,
-                               &group_results[item.get_group_linear_id()],
-                               &local_mem[wgroup_size - 1]);
-              else
-                ValueInit::init(functor,
-                                &group_results[item.get_group_linear_id()]);
-              ValueInit::init(functor, &local_mem[wgroup_size - 1]);
-            }
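+            // local_mem now holds the inclusive scan of each subgroup. Store
+            // the subgroup totals to global memory and shift everything by one
+            // lane (re-initializing lane 0) to obtain an exclusive scan.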
+            const int local_range = sg.get_local_range()[0];
+            if (id_in_sg == local_range - 1)
+              global_mem[sg_group_id + global_offset] = local_mem[local_id];
+            local_mem[local_id] = sg.shuffle_up(local_mem[local_id], 1);
+            if (id_in_sg == 0) ValueInit::init(functor, &local_mem[local_id]);
+            item.barrier(sycl::access::fence_space::local_space);
 
-            // Add results to all items
-            for (size_t stride = wgroup_size / 2; stride > 0; stride /= 2) {
-              auto idx = 2 * stride * (local_id + 1) - 1;
-              if (idx < wgroup_size) {
-                value_type dummy;
-                ValueOps::copy(functor, &dummy, &local_mem[idx - stride]);
-                ValueOps::copy(functor, &local_mem[idx - stride],
-                               &local_mem[idx]);
-                ValueJoin::join(functor, &local_mem[idx], &dummy);
+            // scan subgroup results using the first subgroup
+            if (sg_group_id == 0) {
+              const int n_subgroups = sg.get_group_range()[0];
+              if (local_range < n_subgroups) Kokkos::abort("Not implemented!");
+
+              for (int stride = n_subgroups / 2; stride > 0; stride >>= 1) {
+                auto tmp =
+                    sg.shuffle_up(global_mem[id_in_sg + global_offset], stride);
+                if (id_in_sg >= stride) {
+                  if (id_in_sg < n_subgroups)
+                    ValueJoin::join(
+                        functor, &global_mem[id_in_sg + global_offset], &tmp);
+                  else
+                    global_mem[id_in_sg + global_offset] = tmp;
+                }
               }
-              item.barrier(sycl::access::fence_space::local_space);
             }
+            item.barrier(sycl::access::fence_space::local_space);
+
+            // add results to all subgroups
+            if (sg_group_id > 0)
+              ValueJoin::join(functor, &local_mem[local_id],
+                              &global_mem[sg_group_id - 1 + global_offset]);
+            item.barrier(sycl::access::fence_space::local_space);
+            if (n_wgroups > 1 && local_id == wgroup_size - 1)
+              group_results[item.get_group_linear_id()] =
+                  global_mem[sg_group_id + global_offset];
+            item.barrier(sycl::access::fence_space::local_space);
 
             // Write results to global memory
-            if (global_id < size)
-              ValueOps::copy(functor, &global_mem[global_id],
-                             &local_mem[local_id]);
+            if (global_id < size) global_mem[global_id] = local_mem[local_id];
           });
     });
-
-    if (n_wgroups > 1) scan_internal(q, functor, group_results, n_wgroups);
-    m_policy.space().fence();
-
-    q.submit([&](sycl::handler& cgh) {
-      cgh.parallel_for(sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
-                       [=](sycl::nd_item<1> item) {
-                         const auto global_id = item.get_global_linear_id();
-                         if (global_id < size)
-                           ValueJoin::join(
-                               functor, &global_mem[global_id],
-                               &group_results[item.get_group_linear_id()]);
-                       });
-    });
-    m_policy.space().fence();
+    q.submit_barrier(std::vector<sycl::event>{local_scans});
+
+    if (n_wgroups > 1) {
+      scan_internal(q, functor, group_results, n_wgroups);
+      auto update_with_group_results = q.submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size),
+            [=](sycl::nd_item<1> item) {
+              const auto global_id = item.get_global_linear_id();
+              if (global_id < size)
+                ValueJoin::join(functor, &global_mem[global_id],
+                                &group_results[item.get_group_linear_id()]);
+            });
+      });
+      q.submit_barrier(std::vector<sycl::event>{update_with_group_results});
+    }
   }
 
   template <typename Functor>
-  void sycl_direct_launch(const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Functor& functor) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = m_policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
@@ -185,7 +189,7 @@ class ParallelScanSYCLBase {
     const std::size_t len = m_policy.end() - m_policy.begin();
 
     // Initialize global memory
-    q.submit([&](sycl::handler& cgh) {
+    auto initialize_global_memory = q.submit([&](sycl::handler& cgh) {
       auto global_mem = m_scratch_space;
       auto begin      = m_policy.begin();
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
@@ -197,29 +201,30 @@ class ParallelScanSYCLBase {
           functor(id, update, false);
         else
           functor(WorkTag(), id, update, false);
-        ValueOps::copy(functor, &global_mem[id], &update);
+        global_mem[id] = update;
       });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{initialize_global_memory});
 
-    // Perform the actual exlcusive scan
+    // Perform the actual exclusive scan
     scan_internal(q, functor, m_scratch_space, len);
 
     // Write results to global memory
-    q.submit([&](sycl::handler& cgh) {
+    auto update_global_results = q.submit([&](sycl::handler& cgh) {
       auto global_mem = m_scratch_space;
       cgh.parallel_for(sycl::range<1>(len), [=](sycl::item<1> item) {
-        auto global_id = item.get_id();
+        auto global_id = item.get_id(0);
 
         value_type update = global_mem[global_id];
         if constexpr (std::is_same<WorkTag, void>::value)
           functor(global_id, update, true);
         else
           functor(WorkTag(), global_id, update, true);
-        ValueOps::copy(functor, &global_mem[global_id], &update);
+        global_mem[global_id] = update;
       });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{update_global_results});
+    return update_global_results;
   }
 
  public:
@@ -227,28 +232,39 @@ class ParallelScanSYCLBase {
   void impl_execute(const PostFunctor& post_functor) {
     if (m_policy.begin() == m_policy.end()) return;
 
-    const auto& q = *m_policy.space().impl_internal_space_instance()->m_queue;
+    auto& instance        = *m_policy.space().impl_internal_space_instance();
     const std::size_t len = m_policy.end() - m_policy.begin();
 
-    // FIXME_SYCL The allocation should be handled by the execution space
-    // consider only storing one value per block and recreate initial results in
-    // the end before doing the final pass
-    auto deleter = [&q](value_type* ptr) { sycl::free(ptr, q); };
-    std::unique_ptr<value_type[], decltype(deleter)> result_memory(
-        static_cast<pointer_type>(sycl::malloc(sizeof(value_type) * len, q,
-                                               sycl::usm::alloc::shared)),
-        deleter);
-    m_scratch_space = result_memory.get();
+    // Compute the total amount of memory we will need. We emulate the recursive
+    // structure that is used to do the actual scan. Essentially, we need to
+    // allocate memory for the whole range and then recursively for the reduced
+    // group results until only one group is left.
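+    // For example, for len == 100000 and wgroup_size == 128 this reserves
+    // (782 + 7 + 1) * 128 + 128 = 101248 value_type entries.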
+    std::size_t total_memory = 0;
+    {
+      size_t wgroup_size   = 128;
+      size_t n_nested_size = len;
+      size_t n_nested_wgroups;
+      do {
+        n_nested_wgroups = (n_nested_size + wgroup_size - 1) / wgroup_size;
+        n_nested_size    = n_nested_wgroups;
+        total_memory += sizeof(value_type) * n_nested_wgroups * wgroup_size;
+      } while (n_nested_wgroups > 1);
+      total_memory += sizeof(value_type) * wgroup_size;
+    }
+
+    // FIXME_SYCL consider storing only one value per block and recreating the
+    // initial results at the end before doing the final pass
+    m_scratch_space =
+        static_cast<pointer_type>(instance.scratch_space(total_memory));
 
     Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem&
-        indirectKernelMem = m_policy.space()
-                                .impl_internal_space_instance()
-                                ->m_indirectKernelMem;
+        indirectKernelMem = instance.m_indirectKernelMem;
 
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
 
-    sycl_direct_launch(functor_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
     post_functor();
   }
 
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
index 738620926b5496b9710ce001b77c6fb625325320..9538bf708077cc50404e66e19e048d4341a19761 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -47,8 +47,11 @@
 
 #include <Kokkos_Parallel.hpp>
 
+#include <SYCL/Kokkos_SYCL_Parallel_Reduce.hpp>  // workgroup_reduction
 #include <SYCL/Kokkos_SYCL_Team.hpp>
 
+#include <vector>
+
 namespace Kokkos {
 namespace Impl {
 template <typename... Properties>
@@ -63,8 +66,6 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   friend class TeamPolicyInternal;
 
  private:
-  static int constexpr MAX_WARP = 8;
-
   typename traits::execution_space m_space;
   int m_league_size;
   int m_team_size;
@@ -128,11 +129,18 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   }
   inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
   inline bool impl_auto_team_size() const { return m_tune_team_size; }
+  // FIXME_SYCL This is correct in most cases, but not necessarily in case a
+  // custom sycl::queue is used to initialize the execution space.
   static int vector_length_max() {
-    // FIXME_SYCL provide a reasonable value
-    return 1;
+    std::vector<size_t> sub_group_sizes =
+        execution_space{}
+            .impl_internal_space_instance()
+            ->m_queue->get_device()
+            .template get_info<sycl::info::device::sub_group_sizes>();
+    return *std::max_element(sub_group_sizes.begin(), sub_group_sizes.end());
   }
 
+ private:
   static int verify_requested_vector_length(int requested_vector_length) {
     int test_vector_length =
         std::min(requested_vector_length, vector_length_max());
@@ -140,18 +148,14 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
     // Allow only power-of-two vector_length
     if (!(is_integral_power_of_two(test_vector_length))) {
       int test_pow2 = 1;
-      for (int i = 0; i < 5; i++) {
-        test_pow2 = test_pow2 << 1;
-        if (test_pow2 > test_vector_length) {
-          break;
-        }
-      }
+      while (test_pow2 < test_vector_length) test_pow2 <<= 1;
       test_vector_length = test_pow2 >> 1;
     }
 
     return test_vector_length;
   }
 
+ public:
   static int scratch_size_max(int level) {
     return level == 0 ? 1024 * 32
                       :           // FIXME_SYCL arbitrarily setting this to 32kB
@@ -160,7 +164,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
   inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
   inline void impl_set_team_size(size_t size) { m_team_size = size; }
   int impl_vector_length() const { return m_vector_length; }
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
   KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
+#endif
 
   int team_size() const { return m_team_size; }
 
@@ -206,7 +212,21 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
         m_chunk_size(0),
         m_tune_team_size(bool(team_size_request <= 0)),
         m_tune_vector_length(bool(vector_length_request <= 0)) {
-    // FIXME_SYCL check paramters
+    // FIXME_SYCL Check that league size is permissible,
+    // https://github.com/intel/llvm/pull/4064
+
+    // Make sure total block size is permissible
+    if (m_team_size * m_vector_length >
+        static_cast<int>(
+            m_space.impl_internal_space_instance()->m_maxWorkgroupSize)) {
+      Impl::throw_runtime_exception(
+          std::string("Kokkos::TeamPolicy<SYCL> the team size is too large. "
+                      "Team size x vector length is " +
+                      std::to_string(m_team_size * m_vector_length) +
+                      " but must be smaller than ") +
+          std::to_string(
+              m_space.impl_internal_space_instance()->m_maxWorkgroupSize));
+    }
   }
 
   /** \brief  Specify league size, request team size */
@@ -311,8 +331,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
          2 * sizeof(double) - m_team_scratch_size[0]) /
         (sizeof(double) + m_thread_scratch_size[0]);
     return std::min<int>(
-        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
-        max_threads_for_memory);
+               m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+               max_threads_for_memory) /
+           impl_vector_length();
   }
 
   template <class FunctorType>
@@ -335,8 +356,9 @@ class TeamPolicyInternal<Kokkos::Experimental::SYCL, Properties...>
         (sizeof(double) + sizeof(value_type) * value_count +
          m_thread_scratch_size[0]);
     return std::min<int>(
-        m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
-        max_threads_for_memory);
+               m_space.impl_internal_space_instance()->m_maxWorkgroupSize,
+               max_threads_for_memory) /
+           impl_vector_length();
   }
 
   template <class FunctorType>
@@ -376,14 +398,15 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
   int m_scratch_size[2];
 
   template <typename Functor>
-  void sycl_direct_launch(const Policy& policy, const Functor& functor) const {
+  sycl::event sycl_direct_launch(const Policy& policy,
+                                 const Functor& functor) const {
     // Convenience references
     const Kokkos::Experimental::SYCL& space = policy.space();
     Kokkos::Experimental::Impl::SYCLInternal& instance =
         *space.impl_internal_space_instance();
     sycl::queue& q = *instance.m_queue;
 
-    q.submit([&](sycl::handler& cgh) {
+    auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
       // FIXME_SYCL accessors seem to need a size greater than zero at least for
       // host queues
       sycl::accessor<char, 1, sycl::access::mode::read_write,
@@ -399,14 +422,22 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
       cgh.parallel_for(
           sycl::nd_range<2>(
-              sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+              sycl::range<2>(m_team_size, m_league_size * m_vector_size),
               sycl::range<2>(m_team_size, m_vector_size)),
           [=](sycl::nd_item<2> item) {
+#ifdef KOKKOS_ENABLE_DEBUG
+            if (item.get_sub_group().get_local_range() %
+                    item.get_local_range(1) !=
+                0)
+              Kokkos::abort(
+                  "The sub_group size is not divisible by the vector_size. "
+                  "Choose a smaller vector_size!");
+#endif
             const member_type team_member(
                 team_scratch_memory_L0.get_pointer(), shmem_begin,
                 scratch_size[0],
                 static_cast<char*>(scratch_ptr[1]) +
-                    item.get_group(0) * scratch_size[1],
+                    item.get_group(1) * scratch_size[1],
                 scratch_size[1], item);
             if constexpr (std::is_same<work_tag, void>::value)
               functor(team_member);
@@ -414,7 +445,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
               functor(work_tag(), team_member);
           });
     });
-    space.fence();
+    q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
+    return parallel_for_event;
   }
 
  public:
@@ -429,7 +461,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_functor, indirectKernelMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    sycl::event event =
+        sycl_direct_launch(m_policy, functor_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
   }
 
   ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy)
@@ -451,11 +485,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    const auto& space    = *m_policy.space().impl_internal_space_instance();
-    const sycl::queue& q = *space.m_queue;
-    m_scratch_ptr[0]     = nullptr;
-    m_scratch_ptr[1]     = sycl::malloc_device(
-        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+    auto& space      = *m_policy.space().impl_internal_space_instance();
+    m_scratch_ptr[0] = nullptr;
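+    // Level-1 team scratch is now allocated and cached by the execution space
+    // instance, so this ParallelFor no longer owns (or frees) the allocation.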
+    m_scratch_ptr[1] = space.resize_team_scratch_space(
+        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size);
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -463,27 +496,17 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
              "Requested "
           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
-          << '\n';
+          << space.m_maxShmemPerBlock << '\n';
       Kokkos::Impl::throw_runtime_exception(out.str());
     }
 
+    const auto max_team_size =
+        m_policy.team_size_max(arg_functor, ParallelForTag{});
     if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{}))
       Kokkos::Impl::throw_runtime_exception(
-          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size.");
-  }
-
-  // FIXME_SYCL remove when managing m_scratch_ptr[1] in the execution space
-  // instance
-  ParallelFor(const ParallelFor&) = delete;
-  ParallelFor& operator=(const ParallelFor&) = delete;
-
-  ~ParallelFor() {
-    const Kokkos::Experimental::SYCL& space = m_policy.space();
-    Kokkos::Experimental::Impl::SYCLInternal& instance =
-        *space.impl_internal_space_instance();
-    sycl::queue& q = *instance.m_queue;
-    sycl::free(m_scratch_ptr[1], q);
+          "Kokkos::Impl::ParallelFor<SYCL> requested too large team size. The "
+          "maximal team_size is " +
+          std::to_string(max_team_size) + '!');
   }
 };
 
@@ -516,6 +539,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
+  const bool m_result_ptr_device_accessible;
   // FIXME_SYCL avoid reallocating memory for reductions
   /*  size_type* m_scratch_space;
     size_type* m_scratch_flags;
@@ -529,8 +553,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
   const size_type m_vector_size;
 
   template <typename PolicyType, typename Functor, typename Reducer>
-  void sycl_direct_launch(const PolicyType& policy, const Functor& functor,
-                          const Reducer& reducer) const {
+  sycl::event sycl_direct_launch(const PolicyType& policy,
+                                 const Functor& functor,
+                                 const Reducer& reducer) const {
     using ReducerConditional =
         Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
                            FunctorType, ReducerType>;
@@ -553,25 +578,25 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     sycl::queue& q = *instance.m_queue;
 
     // FIXME_SYCL optimize
-    const size_t wgroup_size = m_team_size;
-    std::size_t size         = m_league_size * m_team_size;
+    const size_t wgroup_size = m_team_size * m_vector_size;
+    std::size_t size         = m_league_size * m_team_size * m_vector_size;
     const auto init_size =
         std::max<std::size_t>((size + wgroup_size - 1) / wgroup_size, 1);
     const unsigned int value_count =
         FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>::value_count(
             selected_reducer);
-    // FIXME_SYCL only use the first half
     const auto results_ptr = static_cast<pointer_type>(instance.scratch_space(
-        sizeof(value_type) * std::max(value_count, 1u) * init_size * 2));
-    // FIXME_SYCL without this we are running into a race condition
-    const auto results_ptr2 =
-        results_ptr + std::max(value_count, 1u) * init_size;
+        sizeof(value_type) * std::max(value_count, 1u) * init_size));
+    value_type* device_accessible_result_ptr =
+        m_result_ptr_device_accessible ? m_result_ptr : nullptr;
+
+    sycl::event last_reduction_event;
 
     // If size<=1 we only call init(), the functor and possibly final once
     // working with the global scratch memory but don't copy back to
     // m_result_ptr yet.
     if (size <= 1) {
-      q.submit([&](sycl::handler& cgh) {
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         // FIXME_SYCL accessors seem to need a size greater than zero at least
         // for host queues
         sycl::accessor<char, 1, sycl::access::mode::read_write,
@@ -606,9 +631,13 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
                 FunctorFinal<FunctorType, WorkTag>::final(
                     static_cast<const FunctorType&>(functor), results_ptr);
+              if (device_accessible_result_ptr)
+                ValueOps::copy(functor, device_accessible_result_ptr,
+                               &results_ptr[0]);
             });
       });
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
     }
 
     // Otherwise, we perform a reduction on the values in all workgroups
@@ -617,8 +646,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // value.
     bool first_run = true;
     while (size > 1) {
-      auto n_wgroups = (size + wgroup_size - 1) / wgroup_size;
-      q.submit([&](sycl::handler& cgh) {
+      auto n_wgroups             = (size + wgroup_size - 1) / wgroup_size;
+      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
         sycl::accessor<value_type, 1, sycl::access::mode::read_write,
                        sycl::access::target::local>
             local_mem(sycl::range<1>(wgroup_size) * std::max(value_count, 1u),
@@ -638,9 +667,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
 
         cgh.parallel_for(
             sycl::nd_range<2>(
-                sycl::range<2>(m_league_size * m_team_size, m_vector_size),
+                sycl::range<2>(m_team_size, m_league_size * m_vector_size),
                 sycl::range<2>(m_team_size, m_vector_size)),
             [=](sycl::nd_item<2> item) {
+#ifdef KOKKOS_ENABLE_DEBUG
+              if (first_run && item.get_sub_group().get_local_range() %
+                                       item.get_local_range(1) !=
+                                   0)
+                Kokkos::abort(
+                    "The sub_group size is not divisible by the vector_size. "
+                    "Choose a smaller vector_size!");
+#endif
               const auto local_id = item.get_local_linear_id();
               const auto global_id =
                   wgroup_size * item.get_group_linear_id() + local_id;
@@ -651,9 +688,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               // In the first iteration, we call functor to initialize the local
               // memory. Otherwise, the local memory is initialized with the
               // results from the previous iteration that are stored in global
-              // memory. Note that we load values_per_thread values per thread
-              // and immediately combine them to avoid too many threads being
-              // idle in the actual workgroup reduction.
+              // memory.
               if (first_run) {
                 reference_type update = ValueInit::init(
                     selected_reducer, &local_mem[local_id * value_count]);
@@ -661,7 +696,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     team_scratch_memory_L0.get_pointer(), shmem_begin,
                     scratch_size[0],
                     static_cast<char*>(scratch_ptr[1]) +
-                        item.get_group(0) * scratch_size[1],
+                        item.get_group(1) * scratch_size[1],
                     scratch_size[1], item);
                 if constexpr (std::is_same<WorkTag, void>::value)
                   functor(team_member, update);
@@ -678,50 +713,18 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
               }
               item.barrier(sycl::access::fence_space::local_space);
 
-              // Perform the actual workgroup reduction. To achieve a better
-              // memory access pattern, we use sequential addressing and a
-              // reversed loop. If the workgroup size is 8, the first element
-              // contains all the values with index%4==0, after the second one
-              // the values with index%2==0 and after the third one index%1==0,
-              // i.e., all values.
-              for (unsigned int stride = wgroup_size / 2; stride > 0;
-                   stride >>= 1) {
-                const auto idx = local_id;
-                if (idx < stride) {
-                  ValueJoin::join(selected_reducer,
-                                  &local_mem[idx * value_count],
-                                  &local_mem[(idx + stride) * value_count]);
-                }
-                item.barrier(sycl::access::fence_space::local_space);
-              }
+              SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
+                  item, local_mem.get_pointer(), results_ptr,
+                  device_accessible_result_ptr, value_count, selected_reducer,
+                  static_cast<const FunctorType&>(functor),
+                  n_wgroups <= 1 && item.get_group_linear_id() == 0);
 
-              // Finally, we copy the workgroup results back to global memory to
-              // be used in the next iteration. If this is the last iteration,
-              // i.e., there is only one workgroup also call final() if
-              // necessary.
-              if (local_id == 0) {
-                ValueOps::copy(
-                    functor,
-                    &results_ptr2[(item.get_group_linear_id()) * value_count],
-                    &local_mem[0]);
-                if constexpr (ReduceFunctorHasFinal<FunctorType>::value)
-                  if (n_wgroups <= 1 && item.get_group_linear_id() == 0) {
-                    FunctorFinal<FunctorType, WorkTag>::final(
-                        static_cast<const FunctorType&>(functor),
-                        &results_ptr2[(item.get_group_linear_id()) *
-                                      value_count]);
-                  }
-              }
+              // FIXME_SYCL not quite sure why this is necessary
+              item.barrier(sycl::access::fence_space::global_space);
             });
       });
-      space.fence();
-
-      // FIXME_SYCL this is likely not necessary, see above
-      Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-                             Kokkos::Experimental::SYCLDeviceUSMSpace>(
-          space, results_ptr, results_ptr2,
-          sizeof(*m_result_ptr) * value_count * n_wgroups);
-      space.fence();
+      q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
+      last_reduction_event = parallel_reduce_event;
 
       first_run = false;
       size      = n_wgroups;
@@ -730,13 +733,17 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // At this point, the reduced value is written to the entry in results_ptr
     // and all that is left is to copy it back to the given result pointer if
     // necessary.
-    if (m_result_ptr) {
+    if (m_result_ptr && !m_result_ptr_device_accessible) {
       Kokkos::Impl::DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              Kokkos::Experimental::SYCLDeviceUSMSpace>(
           space, m_result_ptr, results_ptr,
           sizeof(*m_result_ptr) * value_count);
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ParallelReduce<TeamPolicy,SYCL>: fence because "
+          "reduction can't access result storage location");
     }
+
+    return last_reduction_event;
   }
 
  public:
@@ -753,8 +760,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     const auto reducer_wrapper = Experimental::Impl::make_sycl_function_wrapper(
         m_reducer, indirectReducerMem);
 
-    sycl_direct_launch(m_policy, functor_wrapper.get_functor(),
-                       reducer_wrapper.get_functor());
+    sycl::event event = sycl_direct_launch(
+        m_policy, functor_wrapper.get_functor(), reducer_wrapper.get_functor());
+    functor_wrapper.register_event(indirectKernelMem, event);
+    reducer_wrapper.register_event(indirectReducerMem, event);
   }
 
  private:
@@ -779,11 +788,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
     // FIXME_SYCL so far accessors used instead of these pointers
     // Functor's reduce memory, team scan memory, and team shared memory depend
     // upon team size.
-    const auto& space    = *m_policy.space().impl_internal_space_instance();
-    const sycl::queue& q = *space.m_queue;
-    m_scratch_ptr[0]     = nullptr;
-    m_scratch_ptr[1]     = sycl::malloc_device(
-        sizeof(char) * m_scratch_size[1] * m_league_size, q);
+    auto& space      = *m_policy.space().impl_internal_space_instance();
+    m_scratch_ptr[0] = nullptr;
+    m_scratch_ptr[1] = space.resize_team_scratch_space(
+        static_cast<ptrdiff_t>(m_scratch_size[1]) * m_league_size);
 
     if (static_cast<int>(space.m_maxShmemPerBlock) <
         m_shmem_size - m_shmem_begin) {
@@ -791,8 +799,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
       out << "Kokkos::Impl::ParallelFor<SYCL> insufficient shared memory! "
              "Requested "
           << m_shmem_size - m_shmem_begin << " bytes but maximum is "
-          << m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock
-          << '\n';
+          << space.m_maxShmemPerBlock << '\n';
       Kokkos::Impl::throw_runtime_exception(out.str());
     }
 
@@ -811,6 +818,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy(arg_policy),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result.data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ViewType::memory_space>::accessible),
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
         m_vector_size(arg_policy.impl_vector_length()) {
@@ -823,6 +833,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
         m_policy(arg_policy),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()),
+        m_result_ptr_device_accessible(
+            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                              typename ReducerType::result_view_type::
+                                  memory_space>::accessible),
         m_league_size(arg_policy.league_size()),
         m_team_size(arg_policy.team_size()),
         m_vector_size(arg_policy.impl_vector_length()) {
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
index 75741438e295c543db2737e6943ea52e244d69db..6ec6204e711586b0d88d6882955d21bf830a5327 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp
@@ -56,64 +56,22 @@
 /*--------------------------------------------------------------------------*/
 namespace Kokkos {
 namespace Impl {
-namespace {
-auto USM_memcpy(sycl::queue& q, void* dst, const void* src, size_t n) {
-  return q.memcpy(dst, src, n);
-}
-
-void USM_memcpy(Kokkos::Experimental::Impl::SYCLInternal& space, void* dst,
-                const void* src, size_t n) {
-  (void)USM_memcpy(*space.m_queue, dst, src, n);
-}
-
-void USM_memcpy(void* dst, const void* src, size_t n) {
-  Experimental::SYCL().fence();
-  auto event = USM_memcpy(
-      *Experimental::Impl::SYCLInternal::singleton().m_queue, dst, src, n);
-  Experimental::Impl::SYCLInternal::fence(event);
-}
-}  // namespace
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::Experimental::SYCL>::
-    DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst,
-             const void* src, size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
-}
 
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
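+// Synchronous deep copy on the default instance: fence before the memcpy so
+// previously submitted kernels have finished, and fence again afterwards so
+// the copy is complete when the call returns.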
+void DeepCopySYCL(void* dst, const void* src, size_t n) {
+  Experimental::SYCL().fence("Kokkos::Impl::DeepCopySYCL: fence before memcpy");
+  Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n);
+  Experimental::SYCL().fence("Kokkos::Impl::DeepCopySYCL: fence after memcpy");
 }
 
-DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
-                                                   instance,
-                                               void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
+void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst,
+                       const void* src, size_t n) {
+  instance.impl_internal_space_instance()->m_queue->memcpy(dst, src, n);
 }
 
-DeepCopy<Kokkos::HostSpace, Kokkos::Experimental::SYCLDeviceUSMSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
-}
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(const Kokkos::Experimental::SYCL&
-                                                   instance,
-                                               void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(*instance.impl_internal_space_instance(), dst, src, n);
-}
-
-DeepCopy<Kokkos::Experimental::SYCLDeviceUSMSpace, Kokkos::HostSpace,
-         Kokkos::Experimental::SYCL>::DeepCopy(void* dst, const void* src,
-                                               size_t n) {
-  USM_memcpy(dst, src, n);
+void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) {
+  Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n);
+  Experimental::SYCL().fence(
+      "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy");
 }
 
 }  // namespace Impl
@@ -135,6 +93,11 @@ SYCLSharedUSMSpace::SYCLSharedUSMSpace()
 SYCLSharedUSMSpace::SYCLSharedUSMSpace(sycl::queue queue)
     : m_queue(std::move(queue)) {}
 
+SYCLHostUSMSpace::SYCLHostUSMSpace()
+    : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {}
+SYCLHostUSMSpace::SYCLHostUSMSpace(sycl::queue queue)
+    : m_queue(std::move(queue)) {}
+
 void* allocate_sycl(
     const char* arg_label, const size_t arg_alloc_size,
     const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle,
@@ -184,6 +147,19 @@ void* SYCLSharedUSMSpace::allocate(const char* arg_label,
       sycl::usm::alloc::shared, m_queue);
 }
 
+void* SYCLHostUSMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void* SYCLHostUSMSpace::allocate(const char* arg_label,
+                                 const size_t arg_alloc_size,
+                                 const size_t arg_logical_size) const {
+  return allocate_sycl(
+      arg_label, arg_alloc_size, arg_logical_size,
+      Kokkos::Tools::make_space_handle(name()),
+      RawMemoryAllocationFailure::AllocationMechanism::SYCLMallocHost,
+      sycl::usm::alloc::host, m_queue);
+}
+
 void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
                      const size_t arg_alloc_size, const size_t arg_logical_size,
                      const Kokkos::Tools::SpaceHandle arg_handle,
@@ -195,6 +171,8 @@ void sycl_deallocate(const char* arg_label, void* const arg_alloc_ptr,
                                       reported_size);
   }
 
+  SYCL::impl_static_fence(
+      "Kokkos::Impl::sycl_deallocate: fence before deallocate");
   sycl::free(arg_alloc_ptr, queue);
 }
 
@@ -223,6 +201,19 @@ void SYCLSharedUSMSpace::deallocate(const char* arg_label,
                   Kokkos::Tools::make_space_handle(name()), m_queue);
 }
 
+void SYCLHostUSMSpace::deallocate(void* const arg_alloc_ptr,
+                                  const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void SYCLHostUSMSpace::deallocate(const char* arg_label,
+                                  void* const arg_alloc_ptr,
+                                  const size_t arg_alloc_size,
+                                  const size_t arg_logical_size) const {
+  sycl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size,
+                  Kokkos::Tools::make_space_handle(name()), m_queue);
+}
+
 }  // namespace Experimental
 }  // namespace Kokkos
 
@@ -235,6 +226,9 @@ SharedAllocationRecord<void, void> SharedAllocationRecord<
 
 SharedAllocationRecord<void, void> SharedAllocationRecord<
     Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record;
+
+SharedAllocationRecord<void, void> SharedAllocationRecord<
+    Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record;
 #endif
 
 SharedAllocationRecord<Kokkos::Experimental::SYCLDeviceUSMSpace, void>::
@@ -282,6 +276,27 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace, void>::
                                                   arg_label);
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace, void>::
+    SharedAllocationRecord(
+        const Kokkos::Experimental::SYCLHostUSMSpace& arg_space,
+        const std::string& arg_label, const size_t arg_alloc_size,
+        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
+    // Pass through allocated [ SharedAllocationHeader , user_memory ]
+    // Pass through deallocation function
+    : base_t(
+#ifdef KOKKOS_ENABLE_DEBUG
+          &SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace,
+                                  void>::s_root_record,
+#endif
+          Impl::checked_allocation_with_header(arg_space, arg_label,
+                                               arg_alloc_size),
+          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
+      m_space(arg_space) {
+
+  this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
+                                                  arg_label);
+}
+
 }  // namespace Impl
 }  // namespace Kokkos
 
@@ -317,6 +332,17 @@ SharedAllocationRecord<Kokkos::Experimental::SYCLSharedUSMSpace,
                      alloc_size, alloc_size - sizeof(SharedAllocationHeader));
 }
 
+SharedAllocationRecord<Kokkos::Experimental::SYCLHostUSMSpace,
+                       void>::~SharedAllocationRecord() {
+  const char* label = nullptr;
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    label = RecordBase::m_alloc_ptr->m_label;
+  }
+  const auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     alloc_size, alloc_size - sizeof(SharedAllocationHeader));
+}
+
 //----------------------------------------------------------------------------
 
 }  // namespace Impl
@@ -339,6 +365,8 @@ template class SharedAllocationRecordCommon<
     Kokkos::Experimental::SYCLDeviceUSMSpace>;
 template class SharedAllocationRecordCommon<
     Kokkos::Experimental::SYCLSharedUSMSpace>;
+template class SharedAllocationRecordCommon<
+    Kokkos::Experimental::SYCLHostUSMSpace>;
 
 }  // namespace Impl
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
index a30cf2109a60ccc5934bfc6ee834a831c539d485..c405ad31a5fb6d9bb7abee273b9ff10c474b134c 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp
@@ -92,14 +92,12 @@ class SYCLTeamMember {
     return m_item.get_group_linear_id();
   }
   KOKKOS_INLINE_FUNCTION int league_size() const {
-    // FIXME_SYCL needs to be revised for vector_length>1.
-    return m_item.get_group_range(0);
+    return m_item.get_group_range(1);
   }
   KOKKOS_INLINE_FUNCTION int team_rank() const {
-    return m_item.get_local_linear_id();
+    return m_item.get_local_id(0);
   }
   KOKKOS_INLINE_FUNCTION int team_size() const {
-    // FIXME_SYCL needs to be revised for vector_length>1.
     return m_item.get_local_range(0);
   }
   KOKKOS_INLINE_FUNCTION void team_barrier() const { m_item.barrier(); }
@@ -109,8 +107,17 @@ class SYCLTeamMember {
   //--------------------------------------------------------------------------
 
   template <class ValueType>
-  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& val,
-                                             const int thread_id) const {
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<std::is_arithmetic_v<ValueType>>
+  team_broadcast(ValueType& val, const int thread_id) const {
+    val = sycl::group_broadcast(m_item.get_group(), val,
+                                sycl::id<2>(thread_id, 0));
+  }
+
+  // FIXME_SYCL remove/adapt this overload once the Intel oneAPI implementation
+  // conforms to the SYCL 2020 standard (allowing trivially copyable types).
+  template <class ValueType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<!std::is_arithmetic_v<ValueType>>
+  team_broadcast(ValueType& val, const int thread_id) const {
     // Wait for shared data write until all threads arrive here
     m_item.barrier(sycl::access::fence_space::local_space);
     if (m_item.get_local_id(1) == 0 &&
@@ -119,7 +126,7 @@ class SYCLTeamMember {
     }
     // Wait for shared data read until root thread writes
     m_item.barrier(sycl::access::fence_space::local_space);
-    val = *static_cast<ValueType*>(m_team_reduce);
+    val = *(static_cast<ValueType*>(m_team_reduce));
   }
 
   template <class Closure, class ValueType>
@@ -294,35 +301,43 @@ class SYCLTeamMember {
   //----------------------------------------
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION
       typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& reducer) {
+      vector_reduce(ReducerType const& reducer) const {
     vector_reduce(reducer, reducer.reference());
   }
 
   template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
+  KOKKOS_INLINE_FUNCTION
       typename std::enable_if<is_reducer<ReducerType>::value>::type
-      vector_reduce(ReducerType const& /*reducer*/,
-                    typename ReducerType::value_type& /*value*/) {
-    // FIXME_SYCL
-    Kokkos::abort("Not implemented!");
-  }
+      vector_reduce(ReducerType const& reducer,
+                    typename ReducerType::value_type& value) const {
+    const auto tidx1   = m_item.get_local_id(1);
+    const auto grange1 = m_item.get_local_range(1);
 
-  //--------------------------------------------------------------------------
-  /**\brief  Global reduction across all blocks
-   *
-   *  Return !0 if reducer contains the final value
-   */
-  template <typename ReducerType>
-  KOKKOS_INLINE_FUNCTION static
-      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
-      global_reduce(ReducerType const& /*reducer*/,
-                    int* const /*global_scratch_flags*/,
-                    void* const /*global_scratch_space*/, void* const /*shmem*/,
-                    int const /*shmem_size*/) {
-    // FIXME_SYCL
-    Kokkos::abort("Not implemented!");
+    const auto sg = m_item.get_sub_group();
+
+    if (grange1 == 1) return;
+
+    // Intra vector lane shuffle reduction:
+    typename ReducerType::value_type tmp(value);
+    typename ReducerType::value_type tmp2 = tmp;
+
+    for (int i = grange1; (i >>= 1);) {
+      tmp2 = sg.shuffle_down(tmp, i);
+      if (static_cast<int>(tidx1) < i) {
+        reducer.join(tmp, tmp2);
+      }
+    }
+
+    // Broadcast from root lane to all other lanes.
+    // Cannot use "butterfly" algorithm to avoid the broadcast
+    // because floating point summation is not associative
+    // and thus different threads could have different results.
+
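+    // (sg.get_local_id() / grange1) * grange1 is the first vector lane of this
+    // thread's lane group within the subgroup, i.e. the lane that holds the
+    // reduced value.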
+    tmp2  = sg.shuffle(tmp, (sg.get_local_id() / grange1) * grange1);
+    value = tmp2;
+    reducer.reference() = tmp2;
   }
 
   //----------------------------------------
@@ -489,7 +504,6 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -516,7 +530,6 @@ KOKKOS_INLINE_FUNCTION
   typename ReducerType::value_type value;
   reducer.init(value);
 
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -546,7 +559,6 @@ KOKKOS_INLINE_FUNCTION
 
   reducer.init(reducer.reference());
 
-  // FIXME_SYCL Fix for vector_length>1.
   for (iType i = loop_boundaries.start +
                  loop_boundaries.member.item().get_local_id(0);
        i < loop_boundaries.end;
@@ -609,11 +621,14 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::TeamVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYCL adapt for vector_length != 1
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0))
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
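+  // Flatten the 2D (thread, vector lane) index: lane tidx1 of thread tidx0
+  // starts at tidx0 * grange1 + tidx1 and the whole team advances by
+  // grange0 * grange1 each iteration.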
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
     closure(i);
 }
 
@@ -623,17 +638,20 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember>& loop_boundaries,
                     const Closure& closure, const ReducerType& reducer) {
-  // FIXME_SYCL adapt for vector_length != 1
   typename ReducerType::value_type value;
   reducer.init(value);
 
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0)) {
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
     closure(i, value);
-  }
 
+  loop_boundaries.member.vector_reduce(reducer, value);
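+  // vector_reduce has combined the lanes within each vector lane group;
+  // team_reduce below then combines the threads of the team.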
   loop_boundaries.member.team_reduce(reducer, value);
 }
 
@@ -643,20 +661,23 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember>& loop_boundaries,
                     const Closure& closure, ValueType& result) {
-  // FIXME_SYCL adapt for vector_length != 1
   ValueType val;
   Kokkos::Sum<ValueType> reducer(val);
 
   reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start +
-                 loop_boundaries.member.item().get_local_id(0);
-       i < loop_boundaries.end;
-       i += loop_boundaries.member.item().get_local_range(0)) {
+  const iType tidx0 = loop_boundaries.member.item().get_local_id(0);
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+
+  const iType grange0 = loop_boundaries.member.item().get_local_range(0);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1;
+       i < loop_boundaries.end; i += grange0 * grange1)
     closure(i, val);
-  }
 
-  loop_boundaries.member.team_reduce(reducer, val);
+  loop_boundaries.member.vector_reduce(reducer);
+  loop_boundaries.member.team_reduce(reducer);
   result = reducer.reference();
 }
 
@@ -673,9 +694,14 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
     const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::SYCLTeamMember>&
         loop_boundaries,
     const Closure& closure) {
-  // FIXME_SYC: adapt for vector_length!=1
-  for (auto i = loop_boundaries.start; i != loop_boundaries.end; ++i)
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i);
+
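+  // Synchronize the sub-group so that all vector lanes have finished the
+  // loop before any dependent code executes.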
+  loop_boundaries.member.item().get_sub_group().barrier();
 }
 
 //----------------------------------------------------------------------------
@@ -697,12 +723,16 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember> const& loop_boundaries,
                     Closure const& closure, ReducerType const& reducer) {
-  // FIXME_SYCL adapt for vector_length != 1
   reducer.init(reducer.reference());
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i, reducer.reference());
-  }
+
+  loop_boundaries.member.vector_reduce(reducer);
 }
 
 /** \brief  Intra-thread vector parallel_reduce.
@@ -722,12 +752,16 @@ KOKKOS_INLINE_FUNCTION
     parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct<
                         iType, Impl::SYCLTeamMember> const& loop_boundaries,
                     Closure const& closure, ValueType& result) {
-  // FIXME_SYCL adapt for vector_length != 1
   result = ValueType();
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
+  const iType tidx1 = loop_boundaries.member.item().get_local_id(1);
+  const int grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end;
+       i += grange1)
     closure(i, result);
-  }
+
+  loop_boundaries.member.vector_reduce(Kokkos::Sum<ValueType>(result));
 }
 
 //----------------------------------------------------------------------------
@@ -746,15 +780,59 @@ KOKKOS_INLINE_FUNCTION
     parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<
                       iType, Impl::SYCLTeamMember>& loop_boundaries,
                   const Closure& closure, const ReducerType& reducer) {
-  // FIXME_SYCL modify for vector_length!=1
   using value_type = typename Kokkos::Impl::FunctorAnalysis<
       Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type;
 
   value_type accum;
   reducer.init(accum);
+  const value_type identity = accum;
+
+  // Loop through the boundaries in vector-length chunks; a scan is performed
+  // at each iteration.
+
+  // All thread "lanes" must loop the same number of times.
+  // Determine a common loop end for all thread "lanes".
+  // Requires:
+  //   grange1 is a power of two and thus
+  //     ( end % grange1 ) == ( end & ( grange1 - 1 ) )
+  //   1 <= grange1 <= sub_group size
+
+  const iType tidx1   = loop_boundaries.member.item().get_local_id(1);
+  const iType grange1 = loop_boundaries.member.item().get_local_range(1);
+
+  const int mask          = grange1 - 1;
+  const int rem           = loop_boundaries.end & mask;  // == end % grange1
+  const int end           = loop_boundaries.end + (rem ? grange1 - rem : 0);
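+  // e.g. end == 7 with grange1 == 4 gives rem == 3, so every lane iterates
+  // up to the rounded-up bound of 8.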
+  const auto sg           = loop_boundaries.member.item().get_sub_group();
+  const int vector_offset = (sg.get_local_id() / grange1) * grange1;
+
+  for (int i = tidx1; i < end; i += grange1) {
+    value_type val = identity;
+
+    // First acquire the per-lane contributions.
+    // This sets lane i's val to element i-1's contribution so that the
+    // shuffle_up pass below produces an exclusive scan -- element i's own
+    // contribution is added by the second closure call later.
+    if (i < loop_boundaries.end && tidx1 > 0) closure(i - 1, val, false);
+
+    // Bottom up exclusive scan in triangular pattern where each SYCL thread is
+    // the root of a reduction tree from the zeroth "lane" to itself.
+    //  [t] += [t-1] if t >= 1
+    //  [t] += [t-2] if t >= 2
+    //  [t] += [t-4] if t >= 4
+    //  ...
+    for (int j = 1; j < static_cast<int>(grange1); j <<= 1) {
+      value_type tmp = sg.shuffle_up(val, j);
+      if (j <= static_cast<int>(tidx1)) {
+        reducer.join(val, tmp);
+      }
+    }
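+    // After the triangle each lane holds the exclusive prefix of this chunk,
+    // e.g. with grange1 == 4 and a sum reducer: lane 0 the identity, lane 1
+    // c0, lane 2 c0+c1, lane 3 c0+c1+c2, where c_k is the contribution of
+    // element k of this chunk.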
+
+    // Include accumulation
+    reducer.join(val, accum);
 
-  for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) {
-    closure(i, accum, true);
+    // Add element i's own contribution to val; the result feeds accum for
+    // the next round.
+    if (i < loop_boundaries.end) closure(i, val, true);
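+    // accum is taken from the last lane of the vector lane group (sub-group
+    // lane mask + vector_offset), which holds the inclusive total so far.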
+    accum = sg.shuffle(val, mask + vector_offset);
   }
 }
 
@@ -792,21 +870,26 @@ template <class FunctorType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda) {
-  if (single_struct.team_member.team_rank() == 0) lambda();
+  if (single_struct.team_member.item().get_local_linear_id() == 0) lambda();
 }
 
 template <class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::VectorSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.item().get_local_id(1) == 0) lambda(val);
+  const sycl::nd_item<2> item = single_struct.team_member.item();
+  const auto grange1          = item.get_local_range(1);
+  const auto sg               = item.get_sub_group();
+  if (item.get_local_id(1) == 0) lambda(val);
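+  // Broadcast the result from the first lane of this vector lane group to
+  // all of its lanes.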
+  val = sg.shuffle(val, (sg.get_local_id() / grange1) * grange1);
 }
 
 template <class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION void single(
     const Impl::ThreadSingleStruct<Impl::SYCLTeamMember>& single_struct,
     const FunctorType& lambda, ValueType& val) {
-  if (single_struct.team_member.team_rank() == 0) lambda(val);
+  if (single_struct.team_member.item().get_local_linear_id() == 0) lambda(val);
+  single_struct.team_member.team_broadcast(val, 0);
 }
 
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
index 141a692f6090555cf129997a64bc9e99941f830d..d2820b3b3a34cdb933c4615260a73e1b82e7de34 100644
--- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
+++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp
@@ -89,7 +89,7 @@ class UniqueToken<SYCL, UniqueTokenScope::Global> {
     const Kokkos::pair<int, int> result =
         Kokkos::Impl::concurrent_bitset::acquire_bounded(
             m_buffer, m_count
-#if defined(KOKKOS_ARCH_INTEL_GEN)
+#ifdef KOKKOS_ARCH_INTEL_GPU
             ,
             Kokkos::Impl::clock_tic() % m_count
 #endif
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 92bd671bd53bf89482aee39cdd34b3391e9a01a2..18ef97ae4650ff50e4ea4a51b74ab53c88970ca4 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -288,21 +288,46 @@ int ThreadsExec::in_parallel() {
   return s_current_function && (&s_threads_process != s_current_function_arg) &&
          (s_threads_process.m_pool_base || !is_process());
 }
+void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); }
+void ThreadsExec::fence(const std::string &name) {
+  internal_fence(name, Impl::fence_is_static::yes);
+}
+
+void ThreadsExec::internal_fence(Impl::fence_is_static is_static) {
+  internal_fence((is_static == Impl::fence_is_static::no)
+                     ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence"
+                     : "Kokkos::ThreadsExec::fence: Unnamed Global Fence",
+                 is_static);
+}
 
 // Wait for root thread to become inactive
-void ThreadsExec::fence() {
-  if (s_thread_pool_size[0]) {
-    // Wait for the root thread to complete:
-    Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state,
-                                    ThreadsExec::Active);
-  }
+void ThreadsExec::internal_fence(const std::string &name,
+                                 Impl::fence_is_static is_static) {
+  const auto &fence_lam = [&]() {
+    if (s_thread_pool_size[0]) {
+      // Wait for the root thread to complete:
+      Impl::spinwait_while_equal<int>(s_threads_exec[0]->m_pool_state,
+                                      ThreadsExec::Active);
+    }
 
-  s_current_function     = nullptr;
-  s_current_function_arg = nullptr;
+    s_current_function     = nullptr;
+    s_current_function_arg = nullptr;
 
-  // Make sure function and arguments are cleared before
-  // potentially re-activating threads with a subsequent launch.
-  memory_fence();
+    // Make sure function and arguments are cleared before
+    // potentially re-activating threads with a subsequent launch.
+    memory_fence();
+  };
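+  // Report the fence to the profiling tools: a static fence is reported as a
+  // global device synchronization, an instance fence with a direct fence id.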
+  if (is_static == Impl::fence_is_static::yes) {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>(
+        name,
+        Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+            GlobalDeviceSynchronization,
+        fence_lam);
+  } else {
+    Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Threads>(
+        name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1},
+        fence_lam);
+  }
 }
 
 /** \brief  Begin execution of the asynchronous functor */
@@ -769,7 +794,12 @@ void ThreadsExec::finalize() {
 namespace Kokkos {
 
 int Threads::concurrency() { return impl_thread_pool_size(0); }
-void Threads::fence() const { Impl::ThreadsExec::fence(); }
+void Threads::fence() const {
+  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::no);
+}
+void Threads::fence(const std::string &name) const {
+  Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no);
+}
 
 Threads &Threads::impl_instance(int) {
   static Threads t;
@@ -832,6 +862,9 @@ void ThreadsSpaceInitializer::finalize(const bool all_spaces) {
 }
 
 void ThreadsSpaceInitializer::fence() { Kokkos::Threads::impl_static_fence(); }
+void ThreadsSpaceInitializer::fence(const std::string &name) {
+  Kokkos::Threads::impl_static_fence(name);
+}
 
 void ThreadsSpaceInitializer::print_configuration(std::ostream &msg,
                                                   const bool detail) {
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index 1c8b3ac5f6a7685d2bec7d36b53fc657bf7ba1b9..4d9a72a03467977ed21867a90a84563c7254bba7 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -63,7 +63,6 @@
 
 namespace Kokkos {
 namespace Impl {
-
 class ThreadsExec {
  public:
   // Fan array has log_2(NT) reduction threads plus 2 scan threads
@@ -474,6 +473,12 @@ class ThreadsExec {
 
   static int in_parallel();
   static void fence();
+  static void fence(const std::string &);
+  static void internal_fence(
+      Impl::fence_is_static is_static = Impl::fence_is_static::yes);
+  static void internal_fence(
+      const std::string &,
+      Impl::fence_is_static is_static = Impl::fence_is_static::yes);
   static bool sleep();
   static bool wake();
 
@@ -635,7 +640,12 @@ inline void Threads::print_configuration(std::ostream &s, const bool detail) {
   Impl::ThreadsExec::print_configuration(s, detail);
 }
 
-inline void Threads::impl_static_fence() { Impl::ThreadsExec::fence(); }
+inline void Threads::impl_static_fence() {
+  Impl::ThreadsExec::internal_fence(Impl::fence_is_static::yes);
+}
+inline void Threads::impl_static_fence(const std::string &name) {
+  Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes);
+}
 } /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
index 40a09ed22ab1d6d73b62084549049521e0eb3150..e4eaeac78163efe48a2ddbd6d39920900b035c29 100644
--- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+++ b/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@@ -100,8 +100,8 @@ bool ThreadsExec::spawn() {
 
   pthread_attr_t attr;
 
-  if (0 == pthread_attr_init(&attr) ||
-      0 == pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM) ||
+  if (0 == pthread_attr_init(&attr) &&
+      0 == pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM) &&
       0 == pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) {
     pthread_t pt;
 
diff --git a/packages/kokkos/core/src/desul/.clang-format b/packages/kokkos/core/src/desul/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..9d159247d518108410702980b90b13c2cfb4b84f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/.clang-format
@@ -0,0 +1,2 @@
+DisableFormat: true
+SortIncludes: false
diff --git a/packages/kokkos/core/src/desul/atomics.hpp b/packages/kokkos/core/src/desul/atomics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab3fe25392faa70027cb19c2a02c18c570c5768b
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics.hpp
@@ -0,0 +1,19 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_HPP_
+#define DESUL_ATOMICS_HPP_
+
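+// Macros.hpp comes first since it provides the configuration macros used by
+// the headers below.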
+#include "desul/atomics/Macros.hpp"
+
+#include "desul/atomics/Atomic_Ref.hpp"
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Generic.hpp"
+#include "desul/atomics/Lock_Array.hpp"
+
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp b/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..73cd01a7e6ff9c54b8b851193bf256124f399cfe
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Atomic_Ref.hpp
@@ -0,0 +1,541 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMIC_REF_IMPL_HPP_
+#define DESUL_ATOMIC_REF_IMPL_HPP_
+
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Generic.hpp"
+#include "desul/atomics/Macros.hpp"
+
+namespace desul {
+namespace Impl {
+
+// TODO current implementation is missing the following:
+// * member functions
+//   * wait
+//   * notify_one
+//   * notify_all
+
+template <typename T,
+          typename MemoryOrder,
+          typename MemoryScope,
+          bool = std::is_integral<T>{},
+          bool = std::is_floating_point<T>{}>
+struct basic_atomic_ref;
+
+// base class for non-integral, non-floating-point, non-pointer types
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, false> {
+  static_assert(std::is_trivially_copyable<T>{}, "");
+
+ private:
+  T* _ptr;
+
+  // 1/2/4/8/16-byte types must be aligned to at least their size
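+  // (sizeof(T) & (sizeof(T) - 1)) is non-zero for non-power-of-two sizes, so
+  // only power-of-two sizes up to 16 bytes impose the size-based alignment.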
+  static constexpr int _min_alignment = (sizeof(T) & (sizeof(T) - 1)) || sizeof(T) > 16
+                                            ? 0
+                                            : sizeof(T);
+
+ public:
+  using value_type = T;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = _min_alignment > alignof(T)
+                                                        ? _min_alignment
+                                                        : alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(std::addressof(obj)) {}
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+};
+
+// base class for atomic_ref<integral-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, true, false> {
+  static_assert(std::is_integral<T>{}, "");
+
+ private:
+  T* _ptr;
+
+ public:
+  using value_type = T;
+  using difference_type = value_type;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = sizeof(T) > alignof(T) ? sizeof(T)
+                                                                           : alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(&obj) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_and(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_and(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_or(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_or(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_xor(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_xor(_ptr, arg, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++() const noexcept {
+    return atomic_add_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); }
+
+  DESUL_FUNCTION value_type operator--() const noexcept {
+    return atomic_sub_fetch(_ptr, value_type(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); }
+
+  DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept {
+    return atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept {
+    return atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator&=(value_type arg) const noexcept {
+    return atomic_and_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator|=(value_type arg) const noexcept {
+    return atomic_or_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator^=(value_type arg) const noexcept {
+    return atomic_xor_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+};
+
+// base class for atomic_ref<floating-point-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T, MemoryOrder, MemoryScope, false, true> {
+  static_assert(std::is_floating_point<T>{}, "");
+
+ private:
+  T* _ptr;
+
+ public:
+  using value_type = T;
+  using difference_type = value_type;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T));
+
+  static constexpr std::size_t required_alignment = alignof(T);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T& obj) : _ptr(&obj) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T operator=(T desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T exchange(T desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T& expected,
+                                            T desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected,
+      T desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T& expected, T desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, arg, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(value_type arg, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, arg, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator+=(value_type arg) const noexcept {
+    return atomic_add_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(value_type arg) const noexcept {
+    return atomic_sub_fetch(_ptr, arg, MemoryOrder(), MemoryScope());
+  }
+};
+
+// base class for atomic_ref<pointer-type>
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct basic_atomic_ref<T*, MemoryOrder, MemoryScope, false, false> {
+ private:
+  T** _ptr;
+
+ public:
+  using value_type = T*;
+  using difference_type = std::ptrdiff_t;
+
+  static constexpr bool is_always_lock_free = atomic_always_lock_free(sizeof(T*));
+
+  static constexpr std::size_t required_alignment = alignof(T*);
+
+  basic_atomic_ref() = delete;
+  basic_atomic_ref& operator=(basic_atomic_ref const&) = delete;
+
+  explicit basic_atomic_ref(T*& arg) : _ptr(std::addressof(arg)) {}
+
+  basic_atomic_ref(basic_atomic_ref const&) = default;
+
+  T* operator=(T* desired) const noexcept {
+    this->store(desired);
+    return desired;
+  }
+
+  operator T*() const noexcept { return this->load(); }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION void store(T* desired,
+                            _MemoryOrder order = _MemoryOrder()) const noexcept {
+    atomic_store(_ptr, desired, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T* load(_MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_load(_ptr, order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION T* exchange(T* desired,
+                             _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_exchange(_ptr, desired, order, MemoryScope());
+  }
+
+  DESUL_FUNCTION bool is_lock_free() const noexcept {
+    return atomic_is_lock_free<sizeof(T*), required_alignment>();
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(T*& expected,
+                                            T* desired,
+                                            SuccessMemoryOrder success,
+                                            FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_weak(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_weak(
+      T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_weak(expected,
+                                 desired,
+                                 order,
+                                 cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename SuccessMemoryOrder, typename FailureMemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T*& expected,
+      T* desired,
+      SuccessMemoryOrder success,
+      FailureMemoryOrder failure) const noexcept {
+    return atomic_compare_exchange_strong(
+        _ptr, expected, desired, success, failure, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION bool compare_exchange_strong(
+      T*& expected, T* desired, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return compare_exchange_strong(expected,
+                                   desired,
+                                   order,
+                                   cmpexch_failure_memory_order<_MemoryOrder>());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_add(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_add(_ptr, _type_size(d), order, MemoryScope());
+  }
+
+  template <typename _MemoryOrder = MemoryOrder>
+  DESUL_FUNCTION value_type
+  fetch_sub(difference_type d, _MemoryOrder order = _MemoryOrder()) const noexcept {
+    return atomic_fetch_sub(_ptr, _type_size(d), order, MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++() const noexcept {
+    return atomic_add_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator++(int) const noexcept { return fetch_add(1); }
+
+  DESUL_FUNCTION value_type operator--() const noexcept {
+    return atomic_sub_fetch(_ptr, _type_size(1), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator--(int) const noexcept { return fetch_sub(1); }
+
+  DESUL_FUNCTION value_type operator+=(difference_type d) const noexcept {
+    return atomic_add_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope());
+  }
+
+  DESUL_FUNCTION value_type operator-=(difference_type d) const noexcept {
+    return atomic_sub_fetch(_ptr, _type_size(d), MemoryOrder(), MemoryScope());
+  }
+
+ private:
+  static constexpr std::ptrdiff_t _type_size(std::ptrdiff_t d) noexcept {
+    static_assert(std::is_object<T>{}, "");
+    return d * sizeof(T);
+  }
+};
+
+}  // namespace Impl
+
+template <typename T, typename MemoryOrder, typename MemoryScope>
+struct scoped_atomic_ref : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope> {
+  explicit scoped_atomic_ref(T& obj) noexcept
+      : Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>(obj) {}
+
+  scoped_atomic_ref& operator=(scoped_atomic_ref const&) = delete;
+
+  scoped_atomic_ref(scoped_atomic_ref const&) = default;
+
+  using Impl::basic_atomic_ref<T, MemoryOrder, MemoryScope>::operator=;
+};
+
+}  // namespace desul
+
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/CUDA.hpp b/packages/kokkos/core/src/desul/atomics/CUDA.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..32873a59776b07dea770c193e0034c1e82387246
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/CUDA.hpp
@@ -0,0 +1,453 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_CUDA_HPP_
+#define DESUL_ATOMICS_CUDA_HPP_
+
+#include <cstdint>
+#include <type_traits>
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+// When building with clang we always need to include the device functions,
+// since clang must see a consistent overload set in both device and host
+// compilation. That means we need host-side compile-time knowledge of the
+// architecture, i.e. of what to make visible.
+// DESUL proper simply does not support clang CUDA builds pre Volta; Kokkos has
+// that knowledge and uses it here, which still allows Kokkos to use clang as a
+// CUDA compiler for pre-Volta architectures.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__>=700)) || \
+    (!defined(__NVCC__) && !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL))
+#define DESUL_HAVE_CUDA_ATOMICS_ASM
+#include <desul/atomics/cuda/CUDA_asm.hpp>
+#endif
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__<700)) || \
+    (!defined(__NVCC__) && !defined(DESUL_HAVE_CUDA_ATOMICS_ASM))
+namespace desul {
+namespace Impl {
+template<class T>
+struct is_cuda_atomic_integer_type {
+  static constexpr bool value = std::is_same<T,int>::value ||
+                                std::is_same<T,unsigned int>::value ||
+                                std::is_same<T,unsigned long long int>::value;
+};
+
+template<class T>
+struct is_cuda_atomic_add_type {
+  static constexpr bool value = is_cuda_atomic_integer_type<T>::value ||
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
+                                std::is_same<T,double>::value || 
+#endif
+                                std::is_same<T,float>::value;
+};
+
+template<class T>
+struct is_cuda_atomic_sub_type {
+  static constexpr bool value = std::is_same<T,int>::value ||
+                                std::is_same<T,unsigned int>::value;
+};
+} // Impl
+
+// Atomic Add
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest,val);
+}
+
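+// For any memory order stronger than relaxed the hardware atomic is bracketed
+// by __threadfence() as a conservative way of honoring the requested ordering.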
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest,val);
+  __threadfence();
+  return return_val;
+}
+
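+// Core-scoped atomics simply fall back to the device-scoped implementation.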
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_add_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+
+// Atomic Sub
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_sub_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Inc
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicInc(dest,val);
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicInc(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_inc(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Dec
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicDec(dest,val);
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicDec(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class MemoryOrder>
+__device__ inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_dec(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+
+// Atomic Max
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMax(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMax(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_max(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic Min
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMin(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMin(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_min(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic And
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAnd(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAnd(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_and(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic XOR
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicXor(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicXor(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_xor(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+
+// Atomic OR
+template<class T>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicOr(dest,val);
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicOr(dest,val);
+  __threadfence();
+  return return_val;
+}
+
+template<class T, class MemoryOrder>
+__device__ inline
+typename std::enable_if<Impl::is_cuda_atomic_integer_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_or(dest,val,MemoryOrder(),MemoryScopeDevice());
+}
+} // desul
+#endif
+
+#if !defined(__NVCC__)
+// Functions defined as device functions in CUDA which don't exist in the GCC overload set
+namespace desul {
+
+#if defined(DESUL_HAVE_CUDA_ATOMICS_ASM)
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(TYPE,ORDER,SCOPE) \
+    inline void atomic_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_add(dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(TYPE,ORDER,SCOPE) \
+    inline void atomic_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_sub(dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(int32_t,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_INC(TYPE,ORDER,SCOPE) \
+    inline void atomic_inc(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_inc(dest, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_INC(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(TYPE,ORDER,SCOPE) \
+    inline void atomic_dec(TYPE* const dest, ORDER order, SCOPE scope) { \
+    (void) atomic_fetch_dec(dest, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_DEC(unsigned,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+#endif // DESUL_HAVE_CUDA_ATOMICS_ASM
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_add(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::AddOper<TYPE, const TYPE>(),dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_ADD(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_sub(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::SubOper<TYPE, const TYPE>(),dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(float,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_SUB(double,MemoryOrderRelaxed,MemoryScopeDevice);
+
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_max(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::MaxOper<TYPE, const TYPE>(), dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MAX(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+
+  #define DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(TYPE,ORDER,SCOPE) \
+    inline TYPE atomic_fetch_min(TYPE* const dest, TYPE val, ORDER order, SCOPE scope) { \
+      return Impl::atomic_fetch_oper(Impl::MinOper<TYPE, const TYPE>(), dest, val, order, scope); \
+  }
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(long,MemoryOrderRelaxed,MemoryScopeDevice); // only for ASM?
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned int,MemoryOrderRelaxed,MemoryScopeDevice);
+  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  DESUL_IMPL_CUDA_HOST_ATOMIC_FETCH_MIN(unsigned long long,MemoryOrderRelaxed,MemoryScopeDevice);
+//  inline void atomic_fetch_max(int32_t* const dest, int32_t val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+
+}
+
+// Functions defined in the GCC overload set but not in the device overload set
+namespace desul {
+  __device__ inline
+  unsigned long long atomic_fetch_add(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_add(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_add(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AddOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_sub(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_sub(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::SubOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_max(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::MaxOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_min(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::MinOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_or(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::OrOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_or(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_xor(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::XorOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_xor(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_fetch_and(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AndOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_fetch_and(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_fetch_oper(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+  }
+
+
+  __device__ inline
+  unsigned long long atomic_add_fetch(unsigned long long* const dest, unsigned long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<unsigned long long, const unsigned long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_add_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_add_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AddOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_sub_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::SubOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_sub_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::SubOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_or_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::OrOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_or_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::OrOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_xor_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::XorOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_xor_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::XorOper<long, const long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long long atomic_and_fetch(long long* const dest, long long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AndOper<long long, const long long>(), dest, val, order, scope);
+  }
+  __device__ inline
+  long atomic_and_fetch(long* const dest, long val, MemoryOrderRelaxed order, MemoryScopeDevice scope) {
+    return Impl::atomic_oper_fetch(Impl::AndOper<long, const long>(), dest, val, order, scope);
+  }
+}
+#endif
+#endif  // DESUL_HAVE_CUDA_ATOMICS
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Common.hpp b/packages/kokkos/core/src/desul/atomics/Common.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1dccc6c52318f58b6fb1ed792ed614a8351458c
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Common.hpp
@@ -0,0 +1,199 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMMON_HPP_
+#define DESUL_ATOMICS_COMMON_HPP_
+#include "desul/atomics/Macros.hpp"
+#include <cstdint>
+#include <atomic>
+#include <type_traits>
+
+namespace desul {
+struct alignas(16) Dummy16ByteValue {
+  int64_t value1;
+  int64_t value2;
+  bool operator!=(Dummy16ByteValue v) const {
+    return (value1 != v.value1) || (value2 != v.value2);
+  }
+  bool operator==(Dummy16ByteValue v) const {
+    return (value1 == v.value1) && (value2 == v.value2);
+  }
+};
+}  // namespace desul
+
+// MemoryOrder Tags
+
+namespace desul {
+// Memory order sequential consistent
+struct MemoryOrderSeqCst {};
+// Memory order acquire release
+struct MemoryOrderAcqRel {};
+// Memory order acquire
+struct MemoryOrderAcquire {};
+// Memory order release
+struct MemoryOrderRelease {};
+// Memory order relaxed
+struct MemoryOrderRelaxed {};
+}  // namespace desul
+
+// Memory Scope Tags
+
+namespace desul {
+// Entire machine scope (e.g. for global arrays)
+struct MemoryScopeSystem {};
+// Node level
+struct MemoryScopeNode {};
+// Device or socket scope (i.e. a CPU socket, a single GPU)
+struct MemoryScopeDevice {};
+// Core scoped (i.e. a shared Level 1 cache)
+struct MemoryScopeCore {};
+}  // namespace desul
+
+#ifndef __ATOMIC_RELAXED
+#define __ATOMIC_RELAXED 0
+#define __ATOMIC_CONSUME 1
+#define __ATOMIC_ACQUIRE 2
+#define __ATOMIC_RELEASE 3
+#define __ATOMIC_ACQ_REL 4
+#define __ATOMIC_SEQ_CST 5
+#endif
+
+namespace desul {
+template <class MemoryOrderDesul>
+struct GCCMemoryOrder;
+
+template <>
+struct GCCMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr int value = __ATOMIC_RELAXED;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderAcquire> {
+  static constexpr int value = __ATOMIC_ACQUIRE;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderRelease> {
+  static constexpr int value = __ATOMIC_RELEASE;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr int value = __ATOMIC_ACQ_REL;
+};
+
+template <>
+struct GCCMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr int value = __ATOMIC_SEQ_CST;
+};
+
+template <class MemoryOrderDesul>
+struct CXXMemoryOrder;
+
+template <>
+struct CXXMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr std::memory_order value = std::memory_order_relaxed;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderAcquire> {
+  static constexpr std::memory_order value = std::memory_order_acquire;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderRelease> {
+  static constexpr std::memory_order value = std::memory_order_release;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr std::memory_order value = std::memory_order_acq_rel;
+};
+
+template <>
+struct CXXMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr std::memory_order value = std::memory_order_seq_cst;
+};
+
+namespace Impl {
+template <typename MemoryOrder>
+struct CmpExchFailureOrder {
+  using memory_order = std::conditional_t<
+      std::is_same<MemoryOrder, MemoryOrderAcqRel>{},
+      MemoryOrderAcquire,
+      std::conditional_t<std::is_same<MemoryOrder, MemoryOrderRelease>{},
+                         MemoryOrderRelaxed,
+                         MemoryOrder>>;
+};
+template <typename MemoryOrder>
+using cmpexch_failure_memory_order =
+    typename CmpExchFailureOrder<MemoryOrder>::memory_order;
+}  // namespace Impl
+
+}
+
+// We should in principle use std::numeric_limits, but that requires constexpr function support on the device.
+// Currently that is still considered experimental on CUDA and is sometimes unreliable.
+namespace desul {
+namespace Impl {
+template<class T>
+struct numeric_limits_max;
+
+template<>
+struct numeric_limits_max<uint32_t> {
+  static constexpr uint32_t value = 0xffffffffu;
+};
+template<>
+struct numeric_limits_max<uint64_t> {
+  static constexpr uint64_t value = 0xffffffffffffffffllu;
+};
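+// Illustrative note (not part of the original source): from calling code these traits stand in for
+// std::numeric_limits<T>::max() in device-compatible constexpr contexts, e.g.
+//   static_assert(desul::Impl::numeric_limits_max<uint32_t>::value == 0xffffffffu, "max of uint32_t");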
+
+constexpr bool atomic_always_lock_free(std::size_t size) {
+  return size == 4 || size == 8
+#if defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP)
+         || size == 16
+#endif
+      ;
+}
+
+template <std::size_t Size, std::size_t Align>
+DESUL_INLINE_FUNCTION bool atomic_is_lock_free() noexcept {
+  return Size == 4 || Size == 8
+#if defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP)
+         || Size == 16
+#endif
+      ;
+}
+
+template<std::size_t N>
+struct atomic_compare_exchange_type;
+
+template<>
+struct atomic_compare_exchange_type<4> {
+  using type = int32_t;
+};
+
+template<>
+struct atomic_compare_exchange_type<8> {
+  using type = int64_t;
+};
+
+template<>
+struct atomic_compare_exchange_type<16> {
+  using type = Dummy16ByteValue;
+};
+
+template<class T>
+struct dont_deduce_this_parameter { using type = T; };
+
+template<class T>
+using dont_deduce_this_parameter_t = typename dont_deduce_this_parameter<T>::type;
+
+}
+}
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b8289d75b8e70a1097207418a5a0f435913cded
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange.hpp
@@ -0,0 +1,35 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_HPP_
+
+#include "desul/atomics/Macros.hpp"
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+#include "desul/atomics/Compare_Exchange_GCC.hpp"
+#endif
+#ifdef DESUL_HAVE_MSVC_ATOMICS
+#include "desul/atomics/Compare_Exchange_MSVC.hpp"
+#endif
+#ifdef DESUL_HAVE_SERIAL_ATOMICS
+#include "desul/atomics/Compare_Exchange_Serial.hpp"
+#endif
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+#include "desul/atomics/Compare_Exchange_CUDA.hpp"
+#endif
+#ifdef DESUL_HAVE_HIP_ATOMICS
+#include "desul/atomics/Compare_Exchange_HIP.hpp"
+#endif
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+#include "desul/atomics/Compare_Exchange_OpenMP.hpp"
+#endif
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Compare_Exchange_SYCL.hpp"
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..aab0d943eb659f9c0f860fef1293753bcf5c52be
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_CUDA.hpp
@@ -0,0 +1,267 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_CUDA_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_CUDA_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Lock_Array_Cuda.hpp"
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+namespace desul {
+// Only include if compiling device code, or the CUDA compiler is not NVCC (i.e. Clang)
+// atomic_thread_fence implementation
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  __threadfence();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  __threadfence_block();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  __threadfence_block();
+}
+#if (__CUDA_ARCH__>=600) || !defined(__NVCC__)
+__device__ inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeNode) {
+  __threadfence_system();
+}
+__device__ inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
+  __threadfence_system();
+}
+#endif
+#endif
+}
+
+// Compare-exchange for pre-Volta architectures. This path is not supported with Clang as the CUDA
+// compiler, since we have no way of including the code for Clang only when the compute capability
+// is below 700: on Clang the device-side symbol list must be independent of __CUDA_ARCH__.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) || \
+(!defined(__NVCC__) && (defined(KOKKOS_ENABLE_KEPLER) || defined(KOKKOS_ENABLE_MAXWELL) || defined(KOKKOS_ENABLE_PASCAL)))
+namespace desul {
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
+                                      reinterpret_cast<unsigned int&>(compare),
+                                      reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
+                reinterpret_cast<unsigned long long int&>(compare),
+                reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
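+// Illustrative device-side usage (a hedged sketch, not part of this header; `flag` is a
+// hypothetical int* visible to the kernel): acquiring a simple flag with acq_rel semantics
+// could look like
+//   int expected = 0;
+//   int old = desul::atomic_compare_exchange(flag, expected, 1,
+//                 desul::MemoryOrderAcqRel(), desul::MemoryScopeDevice());
+//   bool acquired = (old == expected);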
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
+                                       reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicExch(reinterpret_cast<unsigned long long int*>(dest),
+                 reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+}  // namespace desul
+#endif
+
+// Include the CUDA PTX-based exchange atomics.
+// When building with Clang we always need to include the device functions, since Clang must see a
+// consistent overload set in both device and host compilation. That, however, means the host pass
+// needs compile-time knowledge of the target architecture. DESUL proper simply does not support a
+// Clang CUDA build for pre-Volta architectures; Kokkos does have that architecture knowledge, so it
+// is used here, which allows Kokkos to use Clang as a CUDA compiler for pre-Volta GPUs.
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__>=700)) || \
+     (!defined(__NVCC__) && !defined(KOKKOS_ARCH_KEPLER) && !defined(KOKKOS_ARCH_MAXWELL) && !defined(KOKKOS_ARCH_PASCAL))
+#include <desul/atomics/cuda/CUDA_asm_exchange.hpp>
+#endif
+
+// SeqCst is not directly supported by PTX, so the additional fences below are needed:
+
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+namespace desul {
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_exchange(dest,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T return_val = atomic_compare_exchange(dest,compare,value,MemoryOrderRelaxed(),MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+}
+#endif
+
+#if defined(__CUDA_ARCH__) || !defined(__NVCC__)
+namespace desul {
+template <typename T, class MemoryOrder, class MemoryScope>
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
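+  // Each still-active lane tries to take a per-address lock; once a lane holds the lock it
+  // performs the compare-exchange under that lock and marks itself done. The ballot over the
+  // original active mask keeps looping until every lane has finished, so divergent lanes in the
+  // same warp cannot deadlock on each other's lock.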
+  T return_val;
+  int done = 0;
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        if(return_val == compare) {
+          *dest = value;
+          atomic_thread_fence(MemoryOrderRelease(),scope);
+        }
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+__device__ typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        if(std::is_same<MemoryOrder,MemoryOrderSeqCst>::value) atomic_thread_fence(MemoryOrderRelease(),scope);
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        *dest = value;
+        atomic_thread_fence(MemoryOrderRelease(),scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+}
+}
+#endif
+
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..418bea0b8b72f42883cc582bd58a5a170f738fea
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_GCC.hpp
@@ -0,0 +1,91 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_GCC_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_GCC_HPP_
+#include "desul/atomics/Common.hpp"
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+#if !defined(DESUL_HAVE_16BYTE_COMPARE_AND_SWAP) && !defined(__CUDACC__)
+// This doesn't work in WSL??
+//#define DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#endif
+namespace desul {
+
+namespace Impl {
+template<class T>
+struct atomic_exchange_available_gcc {
+  constexpr static bool value =
+#ifndef DESUL_HAVE_LIBATOMIC
+    ((sizeof(T)==4 && alignof(T)==4) ||
+#ifdef DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+     (sizeof(T)==16 && alignof(T)==16) ||
+#endif
+     (sizeof(T)==8 && alignof(T)==8)) &&
+#endif
+    std::is_trivially_copyable<T>::value;
+};
+} //namespace Impl
+
+#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+// Disable warning for large atomics on clang 7 and up (checked with godbolt)
+// error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
+// https://godbolt.org/z/G7YhqhbG6
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Watomic-alignment"
+#endif
+template<class MemoryOrder, class MemoryScope>
+void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  __atomic_thread_fence(GCCMemoryOrder<MemoryOrder>::value);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_exchange(
+    T* dest, T value, MemoryOrder, MemoryScope) {
+  T return_val;
+  __atomic_exchange(
+     dest, &value, &return_val, GCCMemoryOrder<MemoryOrder>::value);
+  return return_val;
+}
+
+// The failure memory order of the __atomic_compare_exchange built-in cannot be RELEASE or ACQ_REL,
+// so those two orders are handled separately.
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, GCCMemoryOrder<MemoryOrder>::value, GCCMemoryOrder<MemoryOrder>::value);
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+std::enable_if_t<Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+  return compare;
+}
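+// Illustrative usage (a sketch only, not part of the original header; `p` is a hypothetical
+// shared double*): an update loop built on the overloads above, relying on the fact that they
+// return the value observed at `dest`:
+//   double expected = *p;
+//   while (true) {
+//     double desired = expected * 2.0;
+//     double old = desul::atomic_compare_exchange(
+//         p, expected, desired, desul::MemoryOrderAcqRel(), desul::MemoryScopeDevice());
+//     if (old == expected) break;  // exchange succeeded
+//     expected = old;              // retry with the freshly observed value
+//   }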
+
+#if defined(__clang__) && (__clang_major__>=7) && !defined(__APPLE__)
+#pragma GCC diagnostic pop
+#endif
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d6bf04a7e6d25449934f4813c936bf37ce9bb07b
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_HIP.hpp
@@ -0,0 +1,253 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_HIP_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Lock_Array_HIP.hpp"
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+namespace desul {
+#if defined(__HIP_DEVICE_COMPILE__)
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  __threadfence();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  __threadfence_block();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderRelease, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+inline __device__ void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeNode) {
+  __threadfence_system();
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicCAS(reinterpret_cast<unsigned int*>(dest),
+                                      reinterpret_cast<unsigned int&>(compare),
+                                      reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicCAS(reinterpret_cast<unsigned long long int*>(dest),
+                reinterpret_cast<unsigned long long int&>(compare),
+                reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4,
+                "this function assumes an unsigned int is 32-bit");
+  unsigned int return_val = atomicExch(reinterpret_cast<unsigned int*>(dest),
+                                       reinterpret_cast<unsigned int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrderRelaxed, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8,
+                "this function assumes an unsigned long long  is 64-bit");
+  unsigned long long int return_val =
+      atomicExch(reinterpret_cast<unsigned long long int*>(dest),
+                 reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderRelease, MemoryScope) {
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderAcquire, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderAcqRel, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4 || sizeof(T) == 8, T>::type
+atomic_exchange(T* const dest, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_exchange(dest, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryScope>
+__device__ typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  atomic_thread_fence(MemoryOrderAcquire(), MemoryScope());
+  T return_val = atomic_compare_exchange(
+      dest, compare, value, MemoryOrderRelaxed(), MemoryScope());
+  atomic_thread_fence(MemoryOrderRelease(), MemoryScope());
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION __device__
+    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+    atomic_compare_exchange(
+        T* const dest, T compare, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
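+  // Same strategy as the CUDA fallback: take a per-address lock, perform the update under the
+  // lock, and loop on the ballot so all divergent lanes of a wavefront make progress.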
+  T return_val;
+  int done = 0;
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        if (return_val == compare) {
+          *dest = value;
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        }
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION __device__
+    typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type
+    atomic_exchange(T* const dest, T value, MemoryOrder, MemoryScope scope) {
+  // This is a way to avoid dead lock in a warp or wave front
+  T return_val;
+  int done = 0;
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = value;
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+}
+#endif
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c96cb031714f63b5039ade535077c7511838ffbd
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_MSVC.hpp
@@ -0,0 +1,201 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_MSVC_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_MSVC_HPP_
+#include "desul/atomics/Common.hpp"
+#include <type_traits>
+#ifdef DESUL_HAVE_MSVC_ATOMICS
+
+#ifndef DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#define DESUL_HAVE_16BYTE_COMPARE_AND_SWAP
+#endif
+
+namespace desul {
+
+template<class T, class MemoryOrder, class MemoryScope>
+T atomic_exchange(T* const, T val, MemoryOrder, MemoryScope) { return val;}
+
+
+template<class MemoryOrder, class MemoryScope>
+void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  std::atomic_thread_fence(CXXMemoryOrder<MemoryOrder>::value);
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  char return_val =
+      _InterlockedExchange8((char*)dest, *((char*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  short return_val =
+      _InterlockedExchange16((short*)dest, *((short*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  long return_val =
+      _InterlockedExchange((long*)dest, *((long*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderRelaxed, MemoryScope) {
+  __int64 return_val = _InterlockedExchange64(
+      (__int64*)dest, *((__int64*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  char return_val =
+      _InterlockedExchange8((char*)dest, *((char*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  short return_val =
+      _InterlockedExchange16((short*)dest, *((short*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  long return_val =
+      _InterlockedExchange((long*)dest, *((long*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T val, MemoryOrderSeqCst, MemoryScope) {
+  __int64 return_val = _InterlockedExchange64(
+      (__int64*)dest, *((__int64*)&val));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  char return_val =
+      _InterlockedCompareExchange8((char*)dest, *((char*)&val), *((char*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  short return_val =
+      _InterlockedCompareExchange16((short*)dest, *((short*)&val), *((short*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  long return_val =
+      _InterlockedCompareExchange((long*)dest, *((long*)&val), *((long*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  __int64 return_val = _InterlockedCompareExchange64(
+      (__int64*)dest, *((__int64*)&val), *((__int64*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 16, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderRelaxed, MemoryScope) {
+  Dummy16ByteValue* val16 = reinterpret_cast<Dummy16ByteValue*>(&val);
+  (void)_InterlockedCompareExchange128(reinterpret_cast<__int64*>(dest),
+                                       val16->value2,
+                                       val16->value1,
+                                       (reinterpret_cast<__int64*>(&compare)));
+  return compare;
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 1, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  char return_val =
+      _InterlockedCompareExchange8((char*)dest, *((char*)&val), *((char*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 2, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  short return_val =
+      _InterlockedCompareExchange16((short*)dest, *((short*)&val), *((short*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  long return_val =
+      _InterlockedCompareExchange((long*)dest, *((long*)&val), *((long*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  __int64 return_val = _InterlockedCompareExchange64(
+      (__int64*)dest, *((__int64*)&val), *((__int64*)&compare));
+  return *(reinterpret_cast<T*>(&return_val));
+}
+
+template <typename T, class MemoryScope>
+typename std::enable_if<sizeof(T) == 16, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T val, MemoryOrderSeqCst, MemoryScope) {
+  Dummy16ByteValue* val16 = reinterpret_cast<Dummy16ByteValue*>(&val);
+  (void)_InterlockedCompareExchange128(reinterpret_cast<__int64*>(dest),
+                                       val16->value2,
+                                       val16->value1,
+                                       (reinterpret_cast<__int64*>(&compare)));
+  return compare;
+}
+
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 1 && sizeof(T) != 4 && sizeof(T) != 8 && sizeof(T) != 16), T>::type atomic_compare_exchange(
+     T* const dest, T compare, T val, MemoryOrder, MemoryScope scope) {
+  while (!Impl::lock_address((void*)dest, scope)) {}
+  if (std::is_same<MemoryOrder, MemoryOrderSeqCst>::value)
+          atomic_thread_fence(MemoryOrderRelease(), scope);
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  if(return_val == compare) {
+    *dest = val;
+    atomic_thread_fence(MemoryOrderRelease(),scope);
+  }
+
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+
+}  // namespace desul
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1d1c9124991d01640ca70243e9033e4c528e6cf
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_OpenMP.hpp
@@ -0,0 +1,145 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENMP_HPP_
+#include "desul/atomics/Common.hpp"
+#include <cstdio>
+#include <omp.h>
+
+namespace desul
+{
+namespace Impl
+{
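+// Descriptive note: omp_on_host() is selected at compile time through OpenMP "declare variant":
+// the base definition and the host variant return true, while the nohost (offload device)
+// variant returns false.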
+static constexpr bool omp_on_host() { return true; }
+
+#pragma omp begin declare variant match(device = {kind(host)})
+static constexpr bool omp_on_host() { return true; }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(nohost)})
+static constexpr bool omp_on_host() { return false; }
+#pragma omp end declare variant
+} // namespace Impl
+} // namespace desul
+
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+namespace desul {
+
+#if _OPENMP > 201800
+// atomic_thread_fence for Core Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  // There is no seq_cst flush in OpenMP; presumably an acq_rel flush is equivalent for a fence.
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  #pragma omp flush release
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  #pragma omp flush acquire
+}
+// atomic_thread_fence for Device Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  // There is no seq_cst flush in OpenMP; presumably an acq_rel flush is equivalent for a fence.
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  #pragma omp flush acq_rel
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  #pragma omp flush release
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  #pragma omp flush acquire
+}
+#else
+// atomic_thread_fence for Core Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeCore) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeCore) {
+  #pragma omp flush
+}
+// atomic_thread_fence for Device Scope
+inline void atomic_thread_fence(MemoryOrderSeqCst, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcqRel, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderRelease, MemoryScopeDevice) {
+  #pragma omp flush
+}
+inline void atomic_thread_fence(MemoryOrderAcquire, MemoryScopeDevice) {
+  #pragma omp flush
+}
+#endif
+
+template <typename T, class MemoryOrder, class MemoryScope>
+T atomic_exchange(
+    T* dest, T value, MemoryOrder, MemoryScope) {
+  T return_val;
+  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderAcquire(),MemoryScope());
+  T& x = *dest;
+  #pragma omp atomic capture
+  { return_val = x; x = value; }
+  if(!std::is_same<MemoryOrder,MemoryOrderRelaxed>::value)
+    atomic_thread_fence(MemoryOrderRelease(),MemoryScope());
+  return return_val;
+}
+
+// OpenMP doesn't have a compare-exchange construct, so we use the GCC built-ins and rely on
+// testing that this works. Note that this means we exercise these built-ins inside OpenMPTarget
+// offload regions!
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<Impl::atomic_always_lock_free(sizeof(T)),T> atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  using cas_t = typename Impl::atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t retval = __sync_val_compare_and_swap(
+     reinterpret_cast<volatile cas_t*>(dest), 
+     reinterpret_cast<cas_t&>(compare), 
+     reinterpret_cast<cas_t&>(value));
+  return reinterpret_cast<T&>(retval);
+}
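+// Note: __sync_val_compare_and_swap returns the contents of *dest before the operation, so
+// callers detect success by comparing the returned value against `compare`, matching the
+// convention of the other backends.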
+// Make 16-byte CAS work at least on the host (guarded by the omp_on_host() check; note this requires C++17 for if constexpr)
+#if __cplusplus>=201703L
+
+#if defined(__clang__) && (__clang_major__>=7)
+// Disable warning for large atomics on clang 7 and up (checked with godbolt)
+// error: large atomic operation may incur significant performance penalty [-Werror,-Watomic-alignment]
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Watomic-alignment"
+#endif
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_always_lock_free(sizeof(T)) && (sizeof(T)==16),T> atomic_compare_exchange(
+    T* dest, T compare, T value, MemoryOrder, MemoryScope) {
+  if constexpr (desul::Impl::omp_on_host()) {
+    (void)__atomic_compare_exchange(
+      dest, &compare, &value, false, GCCMemoryOrder<MemoryOrder>::value, GCCMemoryOrder<MemoryOrder>::value);
+    return compare;
+  } else {
+    return value;
+  }
+}
+#if defined(__clang__) && (__clang_major__>=7)
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8fd2ebbe2beef39e4cd8dff5797b722e8d17582
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_SYCL.hpp
@@ -0,0 +1,102 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_SYCL_HPP_
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/SYCLConversions.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+
+namespace desul {
+
+template<class MemoryOrder, class MemoryScope>
+inline void atomic_thread_fence(MemoryOrder, MemoryScope) {
+  DESUL_SYCL_NAMESPACE::atomic_fence(DesulToSYCLMemoryOrder<MemoryOrder>::value,
+                                     DesulToSYCLMemoryScope<MemoryScope>::value);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScope>::value, 
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned int*>(&compare), 
+                                   *reinterpret_cast<unsigned int*>(&value));
+  return compare;
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned long long int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScope>::value, 
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
+  dest_ref.compare_exchange_strong(*reinterpret_cast<unsigned long long int*>(&compare),
+                                   *reinterpret_cast<unsigned long long int*>(&value));
+  return compare;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 4, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned int) == 4, "this function assumes an unsigned int is 32-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned int, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScope>::value,  
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*reinterpret_cast<unsigned int*>(dest));
+  unsigned int return_val = dest_ref.exchange(*reinterpret_cast<unsigned int*>(&value));
+  return reinterpret_cast<T&>(return_val);
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<sizeof(T) == 8, T>::type atomic_exchange(
+    T* const dest, T value, MemoryOrder, MemoryScope) {
+  static_assert(sizeof(unsigned long long int) == 8, "this function assumes an unsigned long long  is 64-bit");
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    unsigned long long int,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScope>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*reinterpret_cast<unsigned long long int*>(dest));
+  unsigned long long int return_val =
+      dest_ref.exchange(reinterpret_cast<unsigned long long int&>(value));
+  return reinterpret_cast<T&>(return_val);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_compare_exchange(
+    T* const /*dest*/, T compare, T /*value*/, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return compare;  
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+typename std::enable_if<(sizeof(T) != 8) && (sizeof(T) != 4), T>::type atomic_exchange(
+    T* const /*dest*/, T value, MemoryOrder, MemoryScope) {
+  // FIXME_SYCL not implemented
+  assert(false);
+  return value;
+}
+
+}
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be7b46d5fa0540f20abfb8903f20a3e2f7d80e5a
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Compare_Exchange_Serial.hpp
@@ -0,0 +1,45 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_SERIAL_HPP_
+#define DESUL_ATOMICS_COMPARE_EXCHANGE_SERIAL_HPP_
+
+#ifdef DESUL_HAVE_SERIAL_ATOMICS
+namespace desul {
+template<class MemoryScope>
+void atomic_thread_fence(MemoryOrderAcquire, MemoryScope) {
+}
+
+template<class MemoryScope>
+void atomic_thread_fence(MemoryOrderRelease, MemoryScope) {
+}
+
+template <typename T, class MemoryScope>
+T atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderRelaxed, MemoryScope) {
+  T old = *dest;
+  if (old == compare) {
+    *dest = value;
+  }
+  return old;
+}
+template <typename T, class MemoryScope>
+T atomic_compare_exchange(
+    T* const dest, T compare, T value, MemoryOrderSeqCst, MemoryScope) {
+  T old = *dest;
+  if (old == compare) {
+    *dest = value;
+  }
+  return old;
+}
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/GCC.hpp b/packages/kokkos/core/src/desul/atomics/GCC.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd0c2bea1180662969e0af8abee5d23a1b7334ca
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/GCC.hpp
@@ -0,0 +1,131 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_GCC_HPP_
+#define DESUL_ATOMICS_GCC_HPP_
+
+#ifdef DESUL_HAVE_GCC_ATOMICS
+
+#include<type_traits>
+/*
+Built-in Function: type __atomic_add_fetch(type *ptr, type val, int memorder)
+Built-in Function: type __atomic_sub_fetch(type *ptr, type val, int memorder)
+Built-in Function: type __atomic_and_fetch(type *ptr, type val, int memorder)
+Built-in Function: type __atomic_xor_fetch(type *ptr, type val, int memorder)
+Built-in Function: type __atomic_or_fetch(type *ptr, type val, int memorder)
+Built-in Function: type __atomic_nand_fetch(type *ptr, type val, int memorder)
+*/
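+// The macro below stamps out the full set of integral fetch-op and op-fetch overloads for one
+// (memory order, memory scope) pair by forwarding to the corresponding __atomic_* built-ins.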
+
+#define DESUL_GCC_INTEGRAL_OP_ATOMICS(MEMORY_ORDER, MEMORY_SCOPE)                 \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_add(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_add(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_sub(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_sub(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_and(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_and(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_or(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_or(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);   \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_xor(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_xor(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_fetch_nand( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_fetch_nand(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value); \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_add_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_add_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_sub_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_sub_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_and_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_and_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_or_fetch(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_or_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);   \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_xor_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_xor_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value);  \
+  }                                                                               \
+  template <typename T>                                                           \
+  typename std::enable_if<std::is_integral<T>::value, T>::type atomic_nand_fetch( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+    return __atomic_nand_fetch(dest, value, GCCMemoryOrder<MEMORY_ORDER>::value); \
+  }
+
+namespace desul {
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeNode)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeDevice)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderRelaxed, MemoryScopeCore)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeNode)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeDevice)
+DESUL_GCC_INTEGRAL_OP_ATOMICS(MemoryOrderSeqCst, MemoryScopeCore)
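+
+// A minimal host-side usage sketch of the wrappers generated above (assumes a build
+// with DESUL_HAVE_GCC_ATOMICS and that client code includes desul/atomics.hpp):
+//
+//   unsigned int flags = 0u;
+//   unsigned int prev = desul::atomic_fetch_or(&flags, 0x4u, desul::MemoryOrderRelaxed(),
+//                                              desul::MemoryScopeDevice());
+//   // prev == 0x0, flags == 0x4 (fetch_* returns the value before the update)
+//   unsigned int now = desul::atomic_or_fetch(&flags, 0x1u, desul::MemoryOrderSeqCst(),
+//                                             desul::MemoryScopeDevice());
+//   // now == 0x5 (*_fetch returns the value after the update)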
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_exchange(T* const dest,
+                  Impl::dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  *dest = val;
+  atomic_thread_fence(MemoryOrderRelease(),scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+std::enable_if_t<!Impl::atomic_exchange_available_gcc<T>::value, T>
+atomic_compare_exchange(T* const dest,
+                  Impl::dont_deduce_this_parameter_t<const T> compare,
+                  Impl::dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  if(return_val == compare) {
+    *dest = val;
+    atomic_thread_fence(MemoryOrderRelease(),scope);
+  }
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+}
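+
+// A minimal sketch of when the lock-based fallbacks above kick in: a hypothetical type
+// that is too wide for the __atomic builtins (so atomic_exchange_available_gcc is false)
+// is exchanged by serializing through Impl::lock_address/unlock_address:
+//
+//   struct Tensor4 { double v[4]; };   // hypothetical 32-byte payload
+//   Tensor4 t{};
+//   Tensor4 prev = desul::atomic_exchange(&t, Tensor4{{1.0, 2.0, 3.0, 4.0}},
+//                                         desul::MemoryOrderSeqCst(),
+//                                         desul::MemoryScopeDevice());
+//   // prev holds the old contents of t; the swap happened under the address lock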
+}  // namespace desul
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Generic.hpp b/packages/kokkos/core/src/desul/atomics/Generic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d5e87ece29f2c444522a91e4635598872f5b71f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Generic.hpp
@@ -0,0 +1,690 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_GENERIC_HPP_
+#define DESUL_ATOMICS_GENERIC_HPP_
+
+#include <type_traits>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Lock_Array.hpp"
+#include "desul/atomics/Macros.hpp"
+// Combination operands to be used in a compare-and-exchange-based atomic
+// operation
+namespace desul {
+namespace Impl {
+
+template <class Scalar1, class Scalar2>
+struct MaxOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 > val2 ? val1 : val2);
+  }
+  DESUL_FORCEINLINE_FUNCTION
+  static constexpr bool check_early_exit(Scalar1 const& val1, Scalar2 const& val2) {
+    return val1 > val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct MinOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 < val2 ? val1 : val2);
+  }
+  DESUL_FORCEINLINE_FUNCTION
+  static constexpr bool check_early_exit(Scalar1 const& val1, Scalar2 const& val2) {
+    return val1 < val2;
+  }
+};
+
+// This early-exit optimization causes weird compiler errors with MSVC 2019
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+template <typename Op, typename Scalar1, typename Scalar2, typename = bool>
+struct may_exit_early : std::false_type {};
+
+template <typename Op, typename Scalar1, typename Scalar2>
+struct may_exit_early<Op,
+                      Scalar1,
+                      Scalar2,
+                      decltype(Op::check_early_exit(std::declval<Scalar1 const&>(),
+                                                    std::declval<Scalar2 const&>()))>
+    : std::true_type {};
+
+template <typename Op, typename Scalar1, typename Scalar2>
+constexpr DESUL_FUNCTION typename std::enable_if<may_exit_early<Op, Scalar1, Scalar2>::value, bool>::type
+check_early_exit(Op const&, Scalar1 const& val1, Scalar2 const& val2) {
+  return Op::check_early_exit(val1, val2);
+}
+
+template <typename Op, typename Scalar1, typename Scalar2>
+constexpr DESUL_FUNCTION typename std::enable_if<!may_exit_early<Op, Scalar1, Scalar2>::value, bool>::type
+check_early_exit(Op const&, Scalar1 const&, Scalar2 const&) {
+  return false;
+}
+#endif
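+
+// A minimal sketch of what the detection above yields (non-MSVC builds): operations
+// that define check_early_exit, such as MaxOper/MinOper, are detected; ones that do
+// not (e.g. AddOper, defined below) fall back to the no-op overload:
+//
+//   static_assert(may_exit_early<MaxOper<int, int>, int, int>::value,
+//                 "MaxOper exposes an early-exit criterion");
+//   static_assert(!may_exit_early<AddOper<int, int>, int, int>::value,
+//                 "AddOper has no early-exit criterion");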
+
+template <class Scalar1, class Scalar2>
+struct AddOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 + val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct SubOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 - val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct MulOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 * val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct DivOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 / val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct ModOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 % val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct AndOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 & val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct OrOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 | val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct XorOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { return val1 ^ val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct NandOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return ~(val1 & val2);
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct LShiftOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1 << val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct RShiftOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1 >> val2;
+  }
+};
+
+template <class Scalar1, class Scalar2>
+struct StoreOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1&, const Scalar2& val2) { return val2; }
+};
+
+template <class Scalar1, class Scalar2>
+struct LoadOper {
+  DESUL_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2&) { return val1; }
+};
+
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_oper(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder order,
+                  MemoryScope scope) {
+  using cas_t = typename atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t oldval = reinterpret_cast<cas_t&>(*dest);
+  cas_t assume = oldval;
+
+  do {
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+    if (Impl::check_early_exit(op, reinterpret_cast<T&>(oldval), val)) return reinterpret_cast<T&>(oldval);
+#endif
+    assume = oldval;
+    T newval = op.apply(reinterpret_cast<T&>(assume), val);
+    oldval = desul::atomic_compare_exchange(
+        reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope);
+  } while (assume != oldval);
+
+  return reinterpret_cast<T&>(oldval);
+}
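+
+// A worked sketch of one pass through the loop above for atomic_fetch_add(&x, 2, ...)
+// with int x == 5: oldval/assume hold the bit pattern of 5, AddOper::apply yields 7,
+// and atomic_compare_exchange(&x, 5, 7, ...) either installs 7 and returns 5 (the loop
+// exits and 5 is returned to the caller) or returns whatever another thread stored in
+// the meantime, in which case the loop retries with that fresher value.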
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_oper_fetch(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder order,
+                  MemoryScope scope) {
+  using cas_t = typename atomic_compare_exchange_type<sizeof(T)>::type;
+  cas_t oldval = reinterpret_cast<cas_t&>(*dest);
+  T newval = val;
+  cas_t assume = oldval;
+  do {
+#ifndef DESUL_HAVE_MSVC_ATOMICS
+    if (Impl::check_early_exit(op, reinterpret_cast<T&>(oldval), val)) return reinterpret_cast<T&>(oldval);
+#endif
+    assume = oldval;
+    newval = op.apply(reinterpret_cast<T&>(assume), val);
+    oldval = desul::atomic_compare_exchange(
+        reinterpret_cast<cas_t*>(dest), assume, reinterpret_cast<cas_t&>(newval), order, scope);
+  } while (assume != oldval);
+
+  return newval;
+}
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires !atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_oper(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+#if defined(DESUL_HAVE_FORWARD_PROGRESS)
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = *dest;
+  *dest = op.apply(return_val, val);
+  atomic_thread_fence(MemoryOrderRelease(),scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+#elif defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+  // This is a way to avoid deadlock in a warp or wavefront
+  T return_val;
+  int done = 0;
+#ifdef __HIPCC__
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = *dest;
+        *dest = op.apply(return_val, val);
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+// FIXME_SYCL not implemented
+#elif defined(__SYCL_DEVICE_ONLY__)
+  (void) op;
+  (void) dest;
+  (void) scope;
+  (void) return_val;
+  (void) done;
+
+  assert(false);
+  return val;
+#else
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = *dest;
+        *dest = op.apply(return_val, val);
+        atomic_thread_fence(MemoryOrderRelease(),scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+#endif
+#else
+  static_assert(false, "Unimplemented lock based atomic\n");
+  return val;
+#endif
+}
+
+template <class Oper, typename T, class MemoryOrder, class MemoryScope,
+  // equivalent to:
+  //   requires !atomic_always_lock_free(sizeof(T))
+  std::enable_if_t<!atomic_always_lock_free(sizeof(T)), int> = 0
+>
+DESUL_INLINE_FUNCTION T
+atomic_oper_fetch(const Oper& op,
+                  T* const dest,
+                  dont_deduce_this_parameter_t<const T> val,
+                  MemoryOrder /*order*/,
+                  MemoryScope scope) {
+#if defined(DESUL_HAVE_FORWARD_PROGRESS)
+  // Acquire a lock for the address
+  while (!Impl::lock_address((void*)dest, scope)) {}
+
+  atomic_thread_fence(MemoryOrderAcquire(),scope);
+  T return_val = op.apply(*dest, val);
+  *dest = return_val;
+  atomic_thread_fence(MemoryOrderRelease(),scope);
+  Impl::unlock_address((void*)dest, scope);
+  return return_val;
+#elif defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+  // This is a way to avoid deadlock in a warp or wavefront
+  T return_val;
+  int done = 0;
+#ifdef __HIPCC__
+  unsigned long long int active = DESUL_IMPL_BALLOT_MASK(1);
+  unsigned long long int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_hip((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(), scope);
+        return_val = op.apply(*dest, val);
+        *dest = return_val;
+        atomic_thread_fence(MemoryOrderRelease(), scope);
+        Impl::unlock_address_hip((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(done);
+  }
+  return return_val;
+  // FIXME_SYCL not implemented
+#elif defined(__SYCL_DEVICE_ONLY__)
+  (void) op;
+  (void) dest;
+  (void) scope;
+  (void) return_val;
+  (void) done;
+
+  assert(false);
+  return val;
+#else
+  unsigned int mask = DESUL_IMPL_ACTIVEMASK;
+  unsigned int active = DESUL_IMPL_BALLOT_MASK(mask, 1);
+  unsigned int done_active = 0;
+  while (active != done_active) {
+    if (!done) {
+      if (Impl::lock_address_cuda((void*)dest, scope)) {
+        atomic_thread_fence(MemoryOrderAcquire(),scope);
+        return_val = op.apply(*dest, val);
+        *dest = return_val;
+        atomic_thread_fence(MemoryOrderRelease(),scope);
+        Impl::unlock_address_cuda((void*)dest, scope);
+        done = 1;
+      }
+    }
+    done_active = DESUL_IMPL_BALLOT_MASK(mask, done);
+  }
+  return return_val;
+#endif
+#else
+  static_assert(false, "Unimplemented lock based atomic\n");
+  return val;
+#endif
+}
+
+}  // namespace Impl
+}  // namespace desul
+
+namespace desul {
+
+// Fetch_Oper atomics: return value before operation
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::MulOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::DivOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::ModOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::NandOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::LShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_fetch_oper(
+      Impl::RShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+// Oper Fetch atomics: return value after operation
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::MulOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::DivOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::ModOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) {
+  return Impl::atomic_oper_fetch(Impl::NandOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::LShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest,
+                                            const unsigned int val,
+                                            MemoryOrder order,
+                                            MemoryScope scope) {
+  return Impl::atomic_oper_fetch(
+      Impl::RShiftOper<T, const unsigned int>(), dest, val, order, scope);
+}
+
+// Other atomics
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_load(const T* const dest,
+                                    MemoryOrder order,
+                                    MemoryScope scope) {
+  return Impl::atomic_fetch_oper(Impl::LoadOper<T, const T>(), const_cast<T*>(dest), T(), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_store(T* const dest,
+                                        const T val,
+                                        MemoryOrder order,
+                                        MemoryScope scope) {
+  (void)Impl::atomic_fetch_oper(Impl::StoreOper<T, const T>(), dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_add(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_add(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_sub(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_sub(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_mul(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_mul(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_div(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_div(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_min(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_min(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_max(T* const dest,
+                                      const T val,
+                                      MemoryOrder order,
+                                      MemoryScope scope) {
+  (void)atomic_fetch_max(dest, val, order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_inc_fetch(T* const dest, MemoryOrder order, MemoryScope scope) {
+  return atomic_add_fetch(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T
+atomic_dec_fetch(T* const dest, MemoryOrder order, MemoryScope scope) {
+  return atomic_sub_fetch(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_fetch_add(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_fetch_sub(dest, T(1), order, scope);
+}
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_inc(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_add(dest, T(1), order, scope);
+}
+
+template <typename T, class MemoryOrder, class MemoryScope>
+DESUL_INLINE_FUNCTION void atomic_dec(T* const dest,
+                                         MemoryOrder order,
+                                         MemoryScope scope) {
+  return atomic_sub(dest, T(1), order, scope);
+}
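+
+// A minimal usage sketch of the frontend above (single-threaded values shown; assumes
+// client code includes the umbrella header desul/atomics.hpp):
+//
+//   int n = 10;
+//   int before = desul::atomic_fetch_add(&n, 5, desul::MemoryOrderRelaxed(),
+//                                        desul::MemoryScopeDevice());   // before == 10, n == 15
+//   int after  = desul::atomic_max_fetch(&n, 42, desul::MemoryOrderSeqCst(),
+//                                        desul::MemoryScopeDevice());   // after == 42, n == 42
+//   desul::atomic_inc(&n, desul::MemoryOrderRelaxed(),
+//                     desul::MemoryScopeCore());                        // n == 43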
+
+// FIXME
+template <typename T,
+          class SuccessMemoryOrder,
+          class FailureMemoryOrder,
+          class MemoryScope>
+DESUL_INLINE_FUNCTION bool atomic_compare_exchange_strong(
+    T* const dest,
+    T& expected,
+    T desired,
+    SuccessMemoryOrder success,
+    FailureMemoryOrder /*failure*/,
+    MemoryScope scope) {
+  T const old = atomic_compare_exchange(dest, expected, desired, success, scope);
+  if (old != expected) {
+    expected = old;
+    return false;
+  } else {
+    return true;
+  }
+}
+
+template <typename T,
+          class SuccessMemoryOrder,
+          class FailureMemoryOrder,
+          class MemoryScope>
+DESUL_INLINE_FUNCTION bool atomic_compare_exchange_weak(T* const dest,
+                                                        T& expected,
+                                                        T desired,
+                                                        SuccessMemoryOrder success,
+                                                        FailureMemoryOrder failure,
+                                                        MemoryScope scope) {
+  return atomic_compare_exchange_strong(
+      dest, expected, desired, success, failure, scope);
+}
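+
+// A minimal sketch of the usual retry loop built on the strong variant above, for some
+// shared int x:
+//
+//   int expected = desul::atomic_load(&x, desul::MemoryOrderRelaxed(),
+//                                     desul::MemoryScopeDevice());
+//   while (!desul::atomic_compare_exchange_strong(&x, expected, expected * 2,
+//                                                 desul::MemoryOrderSeqCst(),
+//                                                 desul::MemoryOrderRelaxed(),
+//                                                 desul::MemoryScopeDevice())) {
+//     // on failure, 'expected' has been updated to the value another thread stored
+//   }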
+
+}  // namespace desul
+
+#include <desul/atomics/SYCL.hpp>
+#include <desul/atomics/CUDA.hpp>
+#include <desul/atomics/GCC.hpp>
+#include <desul/atomics/HIP.hpp>
+#include <desul/atomics/OpenMP.hpp>
+#pragma GCC diagnostic pop
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/HIP.hpp b/packages/kokkos/core/src/desul/atomics/HIP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..16c1f510b7a2627408ccea374004d280997e96df
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/HIP.hpp
@@ -0,0 +1,338 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_HIP_HPP_
+#define DESUL_ATOMICS_HIP_HPP_
+
+#ifdef __HIP_DEVICE_COMPILE__
+namespace desul {
+namespace Impl {
+template <typename T>
+struct is_hip_atomic_integer_type {
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long int>::value;
+};
+
+template <typename T>
+struct is_hip_atomic_add_type {
+  static constexpr bool value = is_hip_atomic_integer_type<T>::value ||
+                                std::is_same<T, double>::value ||
+                                std::is_same<T, float>::value;
+};
+
+template <typename T>
+struct is_hip_atomic_sub_type {
+  static constexpr bool value =
+      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value;
+};
+}  // namespace Impl
+
+// Atomic Add
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAdd(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAdd(dest, val);
+  __threadfence();
+
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_add_type<T>::value, T>::type
+    atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_add(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Sub
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicSub(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicSub(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_sub_type<T>::value, T>::type
+    atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_sub(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Inc
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrderRelaxed,
+                                                MemoryScopeDevice) {
+  return atomicInc(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicInc(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_inc(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeCore) {
+  return atomic_fetch_inc(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Dec
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrderRelaxed,
+                                                MemoryScopeDevice) {
+  return atomicDec(dest, val);
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeDevice) {
+  __threadfence();
+  unsigned int return_val = atomicDec(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename MemoryOrder>
+__device__ inline unsigned int atomic_fetch_dec(unsigned int* dest,
+                                                unsigned int val,
+                                                MemoryOrder,
+                                                MemoryScopeCore) {
+  return atomic_fetch_dec(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Max
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMax(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMax(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_max(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic Min
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicMin(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicMin(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_min(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic And
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicAnd(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicAnd(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_and(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic XOR
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicXor(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicXor(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_xor(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+// Atomic OR
+template <typename T>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrderRelaxed, MemoryScopeDevice) {
+  return atomicOr(dest, val);
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  __threadfence();
+  T return_val = atomicOr(dest, val);
+  __threadfence();
+  return return_val;
+}
+
+template <typename T, typename MemoryOrder>
+__device__ inline
+    typename std::enable_if<Impl::is_hip_atomic_integer_type<T>::value, T>::type
+    atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeCore) {
+  return atomic_fetch_or(dest, val, MemoryOrder(), MemoryScopeDevice());
+}
+
+}  // namespace desul
+
+#define DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MEMORY_ORDER, MEMORY_SCOPE)                 \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_add_type<T>::value, T>::type atomic_fetch_add(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::AddOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_sub_type<T>::value, T>::type atomic_fetch_sub(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::SubOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_and(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::AndOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_or(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::OrOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_xor(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::XorOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_fetch_nand( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_fetch_oper(Impl::NandOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_add_type<T>::value, T>::type atomic_add_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::AddOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_sub_type<T>::value, T>::type atomic_sub_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::SubOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_and_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::AndOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_or_fetch(   \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::OrOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_xor_fetch(  \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::XorOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }                                                                               \
+  template <typename T>                                                           \
+  __device__ typename std::enable_if<std::is_integral<T>::value && !Impl::is_hip_atomic_integer_type<T>::value, T>::type atomic_nand_fetch( \
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       \
+       return Impl::atomic_oper_fetch(Impl::NandOper<T, const T>(), dest, value, MEMORY_ORDER(), MEMORY_SCOPE()); \
+  }
+namespace desul {
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeNode)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeDevice)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderRelaxed, MemoryScopeCore)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeNode)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeDevice)
+DESUL_HIP_GCC_INTEGRAL_OP_ATOMICS_COMPATIBILITY(MemoryOrderSeqCst, MemoryScopeCore)
+}  // namespace desul
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp b/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8fd0e8bbd7718a1097d898b01b20aa71ff515f2f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Lock_Array.hpp
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_HPP_
+
+#include "desul/atomics/Compare_Exchange.hpp"
+#include "desul/atomics/Lock_Array_Cuda.hpp"
+#include "desul/atomics/Lock_Array_HIP.hpp"
+#include "desul/atomics/Macros.hpp"
+
+namespace desul {
+namespace Impl {
+struct host_locks__ {
+  static constexpr uint32_t HOST_SPACE_ATOMIC_MASK = 0xFFFF;
+  static constexpr uint32_t HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  template <typename is_always_void = void>
+  static int32_t* get_host_locks_() {
+    static int32_t HOST_SPACE_ATOMIC_LOCKS_DEVICE[HOST_SPACE_ATOMIC_MASK + 1] = {0};
+    return HOST_SPACE_ATOMIC_LOCKS_DEVICE;
+  }
+  static inline int32_t* get_host_lock_(void* ptr) {
+    return &get_host_locks_()[((uint64_t(ptr) >> 2) & HOST_SPACE_ATOMIC_MASK) ^
+                              HOST_SPACE_ATOMIC_XOR_MASK];
+  }
+};
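+
+// A minimal sketch of the mapping above: an address is hashed to one of 65536 lock
+// slots by dropping the two low bits, masking, and XOR-ing with a fixed pattern:
+//
+//   int data = 0;
+//   void* p = &data;
+//   uint64_t idx = ((uint64_t(p) >> 2) & host_locks__::HOST_SPACE_ATOMIC_MASK) ^
+//                  host_locks__::HOST_SPACE_ATOMIC_XOR_MASK;
+//   int32_t* lock = &host_locks__::get_host_locks_()[idx];   // same as get_host_lock_(p)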
+
+inline void init_lock_arrays() {
+  static bool is_initialized = false;
+  if (!is_initialized) {
+    host_locks__::get_host_locks_();
+    is_initialized = true;
+  }
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+  init_lock_arrays_cuda();
+#endif
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+  init_lock_arrays_hip();
+#endif
+}
+
+inline void finalize_lock_arrays() {
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+  finalize_lock_arrays_cuda();
+#endif
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+  finalize_lock_arrays_hip();
+#endif
+}
+template <typename MemoryScope>
+inline bool lock_address(void* ptr, MemoryScope ms) {
+  return 0 == atomic_exchange(host_locks__::get_host_lock_(ptr),
+                                      int32_t(1),
+                                      MemoryOrderSeqCst(),
+                                      ms);
+}
+template <typename MemoryScope>
+void unlock_address(void* ptr, MemoryScope ms) {
+  (void)atomic_exchange(host_locks__::get_host_lock_(ptr),
+                                int32_t(0),
+                                MemoryOrderSeqCst(),
+                                ms);
+}
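+
+// A minimal sketch of the pattern the lock-based backends build from these helpers
+// (host code; 'ptr' is a placeholder for the address being updated):
+//
+//   while (!lock_address(ptr, MemoryScopeDevice())) {}  // spin until the slot is acquired
+//   // ... perform the non-atomic read-modify-write on *ptr ...
+//   unlock_address(ptr, MemoryScopeDevice());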
+}  // namespace Impl
+}  // namespace desul
+
+#endif  // DESUL_ATOMICS_LOCK_ARRAY_HPP_
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp b/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..de99185349043dc6e0f13c7e57c14dbc080deb9e
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Lock_Array_Cuda.hpp
@@ -0,0 +1,172 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_
+
+#include "desul/atomics/Macros.hpp"
+#include "desul/atomics/Common.hpp"
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+
+#include <cstdint>
+
+namespace desul {
+namespace Impl {
+
+#ifdef __CUDA_ARCH__
+#define DESUL_IMPL_BALLOT_MASK(m, x) __ballot_sync(m, x)
+#define DESUL_IMPL_ACTIVEMASK __activemask()
+#else
+#define DESUL_IMPL_BALLOT_MASK(m, x) ((m) == 0 ? 0 : 1)
+#define DESUL_IMPL_ACTIVEMASK 0
+#endif
+
+/// \brief These global variables in Host space are the central definition
+///        of these arrays.
+extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h;
+extern int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h;
+
+
+/// \brief After this call, the CUDA_SPACE_ATOMIC_LOCKS_*_h host pointers
+///        point to valid, initialized arrays.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snapshotted version while also linking against pure Desul
+template<typename /*AlwaysInt*/ = int>
+void init_lock_arrays_cuda();
+
+/// \brief After this call, the CUDA_SPACE_ATOMIC_LOCKS_*_h host pointers are
+///        all null, and all array memory has been freed.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snapshotted version while also linking against pure Desul
+template<typename T = int>
+void finalize_lock_arrays_cuda();
+
+}  // namespace Impl
+}  // namespace desul
+
+#if defined(__CUDACC__)
+
+namespace desul {
+namespace Impl {
+
+/// \brief This global variable in CUDA space is what kernels use
+///        to get access to the lock arrays.
+///
+/// When relocatable device code is enabled, there can be one single
+/// instance of this global variable for the entire executable,
+/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
+/// here must then be extern).
+/// This one instance will be initialized by init_lock_arrays_cuda
+/// and need not be modified afterwards.
+///
+/// When relocatable device code is disabled, an instance of this variable
+/// will be created in every translation unit that sees this header file
+/// (we make this clear by marking it static, meaning no other translation
+///  unit can link to it).
+/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
+/// instances in other translation units, we must update this CUDA global
+/// variable based on the Host global variable prior to running any kernels
+/// that will use it.
+/// That is the purpose of the DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+__device__
+#ifdef __CUDACC_RDC__
+    __constant__ extern
+#endif
+    int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
+
+__device__
+#ifdef __CUDACC_RDC__
+    __constant__ extern
+#endif
+    int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE;
+
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline bool lock_address_cuda(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[offset], 1));
+}
+__device__ inline bool lock_address_cuda(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 1));
+}
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+__device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[offset], 0);
+}
+__device__ inline void unlock_address_cuda(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
+}
+
+}  // namespace Impl
+}  // namespace desul
+
+// Give lock_array_copied explicit translation-unit scope
+namespace desul {
+namespace Impl {
+namespace {
+static int lock_array_copied = 0;
+inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
+}  // namespace
+}  // namespace Impl
+}  // namespace desul
+/* It is critical that this code be a macro, so that it will
+   capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE;
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                       \
+  {                                                                        \
+    if (::desul::Impl::lock_array_copied == 0) {                           \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE,    \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
+                         sizeof(int32_t*));                                \
+      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE,      \
+                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h,   \
+                         sizeof(int32_t*));                                \
+    }                                                                      \
+    ::desul::Impl::lock_array_copied = 1;                                  \
+  }
+
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+
+#if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+#endif
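+
+// Host-side usage sketch (assumption, not shipped code) for builds with CUDA
+// enabled: initialize the host-side arrays once, then make sure the device
+// symbols see them before launching kernels that may take these locks.
+//
+//   desul::Impl::init_lock_arrays_cuda();       // idempotent host allocation
+//   DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();  // copies symbols; no-op under RDC
+//   some_kernel<<<blocks, threads>>>(...);      // some_kernel is hypothetical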
+
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
diff --git a/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp b/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9e6f5e59800b6778bf2c0592f0104526a730ac00
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Lock_Array_HIP.hpp
@@ -0,0 +1,170 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_LOCK_ARRAY_HIP_HPP_
+#define DESUL_ATOMICS_LOCK_ARRAY_HIP_HPP_
+
+#include "desul/atomics/Common.hpp"
+#include "desul/atomics/Macros.hpp"
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+
+#include <hip/hip_runtime.h>
+
+#include <cstdint>
+
+namespace desul {
+namespace Impl {
+
+#ifdef __HIP_DEVICE_COMPILE__
+#define DESUL_IMPL_BALLOT_MASK(x) __ballot(x)
+#endif
+
+/**
+ * \brief This global variable in Host space is the central definition of these
+ * arrays.
+ */
+extern int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE_h;
+extern int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h;
+
+/// \brief After this call, the g_host_hip_lock_arrays variable has
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snapshotted versions while also linking against pure Desul
+template<typename T = int>
+void init_lock_arrays_hip();
+
+/// \brief After this call, the g_host_hip_lock_arrays variable has
+///        all null pointers, and all array memory has been freed.
+///
+/// This call is idempotent.
+/// The function is templated to make it a weak symbol to deal with Kokkos/RAJA
+///   snapshotted versions while also linking against pure Desul
+template<typename T = int>
+void finalize_lock_arrays_hip();
+}  // namespace Impl
+}  // namespace desul
+
+#ifdef __HIPCC__
+namespace desul {
+namespace Impl {
+
+/**
+ * \brief This global variable in HIP space is what kernels use to get access
+ * to the lock arrays.
+ *
+ * When relocatable device code is enabled, there can be one single instance of
+ * this global variable for the entire executable, whose definition will be in
+ * Kokkos_HIP_Locks.cpp (and whose declaration here must then be extern). This
+ * one instance will be initialized by initialize_host_hip_lock_arrays and need
+ * not be modified afterwards.
+ *
+ * When relocatable device code is disabled, an instance of this variable will
+ * be created in every translation unit that sees this header file (we make this
+ * clear by marking it static, meaning no other translation unit can link to
+ * it). Since the Kokkos_HIP_Locks.cpp translation unit cannot initialize the
+ * instances in other translation units, we must update this HIP global
+ * variable based on the Host global variable prior to running any kernels that
+ * will use it.  That is the purpose of the
+ * DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE macro.
+ */
+__device__
+#ifdef DESUL_HIP_RDC
+    __constant__ extern
+#endif
+    int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE;
+
+__device__
+#ifdef DESUL_HIP_RDC
+    __constant__ extern
+#endif
+    int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE;
+
+#define HIP_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline bool lock_address_hip(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[offset], 1));
+}
+
+__device__ inline bool lock_address_hip(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  return (0 == atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 1));
+}
+
+/**
+ * \brief Release lock for the address
+ *
+ * This function releases the lock for the hash value derived from the provided
+ * ptr. This function should only be called after previously successfully
+ * acquiring a lock with lock_address.
+ */
+__device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeDevice) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[offset], 0);
+}
+
+__device__ inline void unlock_address_hip(void* ptr, desul::MemoryScopeNode) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & HIP_SPACE_ATOMIC_MASK;
+  atomicExch(&desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[offset], 0);
+}
+#endif
+}  // namespace Impl
+}  // namespace desul
+
+// Make lock_array_copied a translation-unit-local variable
+namespace desul {
+namespace Impl {
+namespace {
+static int lock_array_copied = 0;
+inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
+}  // namespace
+}  // namespace Impl
+}  // namespace desul
+
+/* It is critical that this code be a macro, so that it will
+   capture the right address for desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE;
+   putting this in an inline function will NOT do the right thing! */
+#define DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                               \
+  {                                                                               \
+    if (::desul::Impl::lock_array_copied == 0) {                                  \
+      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE), \
+                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,          \
+                        sizeof(int32_t*));                                        \
+      (void) hipMemcpyToSymbol(HIP_SYMBOL(::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE),   \
+                        &::desul::Impl::HIP_SPACE_ATOMIC_LOCKS_NODE_h,            \
+                        sizeof(int32_t*));                                        \
+    }                                                                             \
+    ::desul::Impl::lock_array_copied = 1;                                         \
+  }
+
+#endif
+
+#if defined(DESUL_HIP_RDC) || (!defined(__HIPCC__))
+#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
+#endif
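+
+// Host-side usage sketch for HIP (assumption, not shipped code), mirroring the
+// CUDA header above:
+//
+//   desul::Impl::init_lock_arrays_hip();       // idempotent host allocation
+//   DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();  // copies symbols; no-op under RDC
+//   hipLaunchKernelGGL(some_kernel, blocks,    // some_kernel is hypothetical
+//                      threads, 0, 0 /*, args... */);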
+
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/Macros.hpp b/packages/kokkos/core/src/desul/atomics/Macros.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..db9962e03bd84052a4d61a89cf39892e40051b89
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/Macros.hpp
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_MACROS_HPP_
+#define DESUL_ATOMICS_MACROS_HPP_
+
+// Macros
+
+#if defined(__GNUC__) && \
+    (!defined(__CUDA_ARCH__) || !defined(__NVCC__)) && \
+    (!defined(__HIP_DEVICE_COMPILE__) || !defined(__HIP_PLATFORM_HCC__)) && \
+    !defined(__SYCL_DEVICE_ONLY__) && \
+    !defined(DESUL_HAVE_OPENMP_ATOMICS) && \
+    !defined(DESUL_HAVE_SERIAL_ATOMICS)
+#define DESUL_HAVE_GCC_ATOMICS
+#endif
+
+#ifdef _MSC_VER
+#define DESUL_HAVE_MSVC_ATOMICS
+#endif
+
+#ifdef __CUDACC__
+#define DESUL_HAVE_CUDA_ATOMICS
+#endif
+
+#ifdef __HIPCC__
+#define DESUL_HAVE_HIP_ATOMICS
+#endif
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define DESUL_HAVE_SYCL_ATOMICS
+#ifdef __clang__
+#define DESUL_SYCL_NAMESPACE sycl::ONEAPI
+#else
+#define DESUL_SYCL_NAMESPACE sycl
+#endif
+#endif
+
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
+#define DESUL_HAVE_GPU_LIKE_PROGRESS
+#endif
+
+#if defined(DESUL_HAVE_CUDA_ATOMICS) || defined(DESUL_HAVE_HIP_ATOMICS)
+#define DESUL_FORCEINLINE_FUNCTION inline __host__ __device__
+#define DESUL_INLINE_FUNCTION inline __host__ __device__
+#define DESUL_FUNCTION __host__ __device__
+#else
+#define DESUL_FORCEINLINE_FUNCTION inline
+#define DESUL_INLINE_FUNCTION inline
+#define DESUL_FUNCTION
+#endif
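+
+// Illustrative use of the annotation macros above (a sketch, not shipped code):
+//
+//   DESUL_INLINE_FUNCTION
+//   int clamp_to_zero(int x) { return x < 0 ? 0 : x; }
+//
+// With CUDA or HIP atomics enabled this expands to `inline __host__ __device__`,
+// so the same function is callable from host and device code; otherwise it is a
+// plain `inline` host function.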
+
+#if !defined(DESUL_HAVE_GPU_LIKE_PROGRESS)
+#define DESUL_HAVE_FORWARD_PROGRESS
+#endif
+
+#endif  // DESUL_ATOMICS_MACROS_HPP_
diff --git a/packages/kokkos/core/src/desul/atomics/OpenMP.hpp b/packages/kokkos/core/src/desul/atomics/OpenMP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fa22c36aca37e9e91e5b08aac9b3e61b8256ebc
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/OpenMP.hpp
@@ -0,0 +1,15 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_OPENMP_HPP_
+#define DESUL_ATOMICS_OPENMP_HPP_
+
+#ifdef DESUL_HAVE_OPENMP_ATOMICS
+
+#include<desul/atomics/openmp/OpenMP_40.hpp>
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/SYCL.hpp b/packages/kokkos/core/src/desul/atomics/SYCL.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..44e2dc0ec4ea843d6b6b4e9896b27fc63df6baad
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/SYCL.hpp
@@ -0,0 +1,143 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+#ifndef DESUL_ATOMICS_SYCL_HPP_
+#define DESUL_ATOMICS_SYCL_HPP_
+
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Common.hpp"
+
+namespace desul {
+namespace Impl {
+template<class T>
+struct is_sycl_atomic_type {
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, long>::value ||
+                                std::is_same<T, unsigned long>::value ||
+                                std::is_same<T, long long>::value ||
+                                std::is_same<T, unsigned long long int>::value ||
+                                std::is_same<T, float>::value ||
+                                std::is_same<T, double>::value;
+};
+}  // namespace Impl
+
+// Atomic Add
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_add(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T, 
+    DesulToSYCLMemoryOrder<MemoryOrder>::value, 
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,  
+    sycl::access::address_space::global_device_space> 
+  dest_ref(*dest);
+  return dest_ref.fetch_add(val);
+}
+
+// Atomic Sub 
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_sub(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_sub(val);
+}
+
+// Atomic Inc
+template<class MemoryOrder/*, class MemoryScope*/>
+inline
+unsigned int atomic_fetch_inc(unsigned int* dest, unsigned int val, MemoryOrder memory_order, MemoryScopeDevice memory_scope) {
+  return atomic_fetch_add(dest, val, memory_order, memory_scope);
+}
+
+// Atomic Dec
+template<class MemoryOrder/*, class MemoryScope*/>
+inline
+unsigned int atomic_fetch_dec(unsigned int* dest, unsigned int val, MemoryOrder memory_order, MemoryScopeDevice memory_scope) {
+  return atomic_fetch_sub(dest, val, memory_order, memory_scope);
+}
+
+// Atomic Max
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_max(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_max(val);
+}
+
+// Atomic Min
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_min(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_min(val);
+}
+
+// Atomic And
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_and(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_and(val);
+}
+
+// Atomic XOR
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_xor(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_xor(val);
+}
+
+// Atomic OR
+template<class T, class MemoryOrder/*, class MemoryScope*/>
+inline
+typename std::enable_if<Impl::is_sycl_atomic_type<T>::value,T>::type
+atomic_fetch_or(T* dest, T val, MemoryOrder, MemoryScopeDevice) {
+  DESUL_SYCL_NAMESPACE::atomic_ref<
+    T,
+    DesulToSYCLMemoryOrder<MemoryOrder>::value,
+    DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+    sycl::access::address_space::global_device_space>
+  dest_ref(*dest);
+  return dest_ref.fetch_or(val);
+}
+
+}  // namespace desul
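+
+// Usage sketch (assumption, not shipped code): inside a SYCL kernel one could
+// write
+//
+//   desul::atomic_fetch_add(counter, 1,
+//                           desul::MemoryOrderRelaxed(),
+//                           desul::MemoryScopeDevice());
+//
+// where counter is a hypothetical int* into global device memory; the
+// enable_if constraints above restrict T to the types sycl::atomic_ref
+// supports.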
+#endif  // DESUL_HAVE_SYCL_ATOMICS
+#endif  // DESUL_ATOMICS_SYCL_HPP_
diff --git a/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp b/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a66e5cf051f684695b17b7fae9fe2aaa5009a2c3
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/SYCLConversions.hpp
@@ -0,0 +1,58 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_SYCL_CONVERSIONS_HPP_
+#define DESUL_ATOMICS_SYCL_CONVERSIONS_HPP_
+#ifdef DESUL_HAVE_SYCL_ATOMICS
+#include "desul/atomics/Common.hpp"
+#include <CL/sycl.hpp>
+
+namespace desul {
+
+template<class MemoryOrder>
+struct DesulToSYCLMemoryOrder;
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderSeqCst> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::seq_cst;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderAcquire> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::acquire;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderRelease> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::release;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderAcqRel> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::acq_rel;
+};
+template<>
+struct DesulToSYCLMemoryOrder<MemoryOrderRelaxed> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_order value = DESUL_SYCL_NAMESPACE::memory_order::relaxed;
+};
+
+template<class MemoryScope>
+struct DesulToSYCLMemoryScope;
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeCore> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::work_group;
+};
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeDevice> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::device;
+};
+template<>
+struct DesulToSYCLMemoryScope<MemoryScopeSystem> {
+  static constexpr DESUL_SYCL_NAMESPACE::memory_scope value = DESUL_SYCL_NAMESPACE::memory_scope::system;
+};
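+
+// Example of how these traits are consumed (a sketch): the SYCL backend builds
+// a sycl::atomic_ref whose order/scope template arguments come from the desul
+// tag types, e.g. for a relaxed, device-scope int in global memory:
+//
+//   DESUL_SYCL_NAMESPACE::atomic_ref<
+//       int,
+//       DesulToSYCLMemoryOrder<MemoryOrderRelaxed>::value,
+//       DesulToSYCLMemoryScope<MemoryScopeDevice>::value,
+//       sycl::access::address_space::global_device_space>
+//       ref(my_int);  // my_int is a hypothetical int residing in device memory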
+
+}  // namespace desul
+
+#endif
+#endif
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..461d3e0928a19840973d1c971ccd100968193a42
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm.hpp
@@ -0,0 +1,18 @@
+#include<limits>
+namespace desul {
+#if defined(__CUDA_ARCH__)  || (defined(__clang__) && !defined(__NVCC__))
+// Choose the variant of atomics we are using later
+#if !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL) && \
+    !defined(DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL)
+#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__==11) && (__CUDACC_VER_MINOR__>1))
+#define DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#else
+#define DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#endif
+#endif
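+// What each variant selected above means for the included .inc files:
+//   DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC     - plain "atom.*" on the generic
+//                                            address space
+//   DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL    - branch on __isGlobal(dest) and use
+//                                            "atom.*.global" when possible
+//   DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE   - select the ".global" form with an
+//                                            "isspacep.global" predicate inside
+//                                            the inline asm
+//   DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL - always "atom.*.global"; not
+//                                            generally safe, kept for
+//                                            performance comparisons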
+#include<desul/atomics/cuda/cuda_cc7_asm.inc>
+
+#endif
+}
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ab95e6a00a4e67ffad0232f8652e8a0c9d4f6e6
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/CUDA_asm_exchange.hpp
@@ -0,0 +1,8 @@
+#include<limits>
+namespace desul {
+#if defined(__CUDA_ARCH__)  || (defined(__clang__) && !defined(__NVCC__))
+
+#include<desul/atomics/cuda/cuda_cc7_asm_exchange.inc>
+
+#endif
+}
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc
new file mode 100644
index 0000000000000000000000000000000000000000..2bc64a74b2caf06a731a319c07c7b26490924aa3
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm.inc
@@ -0,0 +1,20 @@
+
+// Non-returning atomic operations (PTX red instruction) only exist for the relaxed and release memory orders
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeDevice
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".gpu"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeNode
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".sys"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeCore
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".cta"
+#include "desul/atomics/cuda/cuda_cc7_asm_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
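+
+// Each include above instantiates the memory-order variants (via
+// cuda_cc7_asm_memorder.inc) for one memory scope, so e.g. a device-scope
+// atomic add lowers to PTX of the form "atom.add<order>.gpu.<type>", while
+// node (system) scope uses the ".sys" qualifier and core (block) scope ".cta".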
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..6de590a952fa38f60c58a34c9499a420502ae381
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc
@@ -0,0 +1,18 @@
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC
+#include "cuda_cc7_asm_atomic_fetch_op.inc_generic"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#include "cuda_cc7_asm_atomic_fetch_op.inc_isglobal"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#include "cuda_cc7_asm_atomic_fetch_op.inc_predicate"
+#endif
+
+// This version is not generally safe;
+// it is only here for performance comparison purposes.
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL
+#include "cuda_cc7_asm_atomic_fetch_op.inc_forceglobal"
+#endif
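+
+// Exactly one of the includes above is active per translation unit; the
+// DESUL_IMPL_ATOMIC_CUDA_PTX_* selector is set in atomics/cuda/CUDA_asm.hpp
+// (defaulting to ISGLOBAL for CUDA 11.2+ and PREDICATE otherwise, unless one
+// of the selectors was predefined).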
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
new file mode 100644
index 0000000000000000000000000000000000000000..d00e2223d22485f4ee831dd53dce064a49deec5e
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_forceglobal
@@ -0,0 +1,143 @@
+
+// Inline PTX: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops:
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
new file mode 100644
index 0000000000000000000000000000000000000000..364b6a2e4d1950f110c29958976a660d01d05771
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_generic
@@ -0,0 +1,142 @@
+
+// Inline PTX: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: 
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.and" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.or" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.xor" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile("atom.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile("atom.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
new file mode 100644
index 0000000000000000000000000000000000000000..2e8e54062dd3494f7440b959618359ef0547d87b
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_isglobal
@@ -0,0 +1,190 @@
+
+// Inline PTX: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops:
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } else { \
+  asm volatile("atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  } \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  } else { \
+  asm volatile("atom.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } else { \
+  asm volatile("atom.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } else { \
+  asm volatile("atom.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } else { \
+  asm volatile("atom.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;" : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  } \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR(ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR(ctype)
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
new file mode 100644
index 0000000000000000000000000000000000000000..5f53279daf541aad169e1bc5a046518cc5b084c8
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc_predicate
@@ -0,0 +1,226 @@
+
+// Inline PTX: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops:
+
+// binary operations
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_and(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.and.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.and"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_or(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.or.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.or"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "@!p atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_fetch_xor(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.xor.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "@!p atom.xor"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+// Fetch atomics
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_add(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_sub(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  ctype neg_value = -value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(neg_value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_min(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_max(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result=0; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(value) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_inc(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.inc.gobal" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.inc"       __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+inline __device__ ctype atomic_fetch_dec(ctype* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  ctype result = 0; \
+  ctype limit = desul::Impl::numeric_limits_max<ctype>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %1;\n\t" \
+          "@p  atom.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "@!p atom.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_ctype " %0,[%1],%2;\n\t" \
+          "}\n\t" \
+    : reg_ret_ctype(result) : "l"(dest),reg_ctype(limit) : "memory"); \
+  return result; \
+}
+
+// Group ops for integer ctypes
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN(ctype,asm_ctype,reg_ctype,reg_ret_ctype) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX(ctype,asm_ctype,reg_ctype,reg_ret_ctype)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP() \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND() \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR() \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR()
+
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(float,".f32","f","=f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD(double,".f64","d","=d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB(double,".f64","d","=d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP(uint64_t,".u64","l","=l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int32_t,".s32","r","=r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP(int64_t,".s64","l","=l")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC(uint32_t,".u32","r","=r")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC(uint32_t,".u32","r","=r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_ADD
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_SUB
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MIN
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_MAX
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_DEC
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_AND
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_OR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_XOR
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_INTEGER_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_UNSIGNED_OP
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_FETCH_BIN_OP
+
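The overloads generated above are normally reached through desul's tag-dispatched front end. A minimal usage sketch, assuming the `desul/atomics.hpp` umbrella header and the usual `desul::MemoryOrder*` / `desul::MemoryScope*` tag types (the header path is an assumption, not something stated in this patch):

```cpp
// Hedged sketch: device code counting positive entries with a relaxed,
// device-scoped fetch-add. Whether the call resolves to one of the
// PTX-backed overloads above depends on the desul configuration built.
#include <desul/atomics.hpp>  // assumed umbrella header

__global__ void count_positive(unsigned int* counter, const float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && x[i] > 0.0f) {
    desul::atomic_fetch_add(counter, 1u,
                            desul::MemoryOrderRelaxed(),
                            desul::MemoryScopeDevice());
  }
}
```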
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..ca02410515db3491c002b569f01ee150d1c0c683
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc
@@ -0,0 +1,18 @@
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC
+#include "cuda_cc7_asm_atomic_op.inc_generic"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
+#include "cuda_cc7_asm_atomic_op.inc_isglobal"
+#endif
+
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE
+#include "cuda_cc7_asm_atomic_op.inc_predicate"
+#endif
+
+// This version is not generally safe and is only here
+// for performance comparison purposes.
+#ifdef DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL
+#include "cuda_cc7_asm_atomic_op.inc_forceglobal"
+#endif
+
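The four `DESUL_IMPL_ATOMIC_CUDA_PTX_*` switches above are mutually exclusive selectors for the non-returning atomic implementation. A sketch of how one variant might be selected for a translation unit follows; in practice the definition would normally come from the build system, and the umbrella header name is an assumption:

```cpp
// Pick the __isGlobal()-based code path before any desul header is included.
//   DESUL_IMPL_ATOMIC_CUDA_PTX_GENERIC     - generic-address red instructions
//   DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL    - runtime __isGlobal() branch (this sketch)
//   DESUL_IMPL_ATOMIC_CUDA_PTX_PREDICATE   - isspacep.global predicated PTX
//   DESUL_IMPL_ATOMIC_CUDA_PTX_FORCEGLOBAL - not generally safe, perf comparison only
#define DESUL_IMPL_ATOMIC_CUDA_PTX_ISGLOBAL
#include <desul/atomics.hpp>  // assumed umbrella header
```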
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
new file mode 100644
index 0000000000000000000000000000000000000000..3767b2ab4980c0d811357a6d0e1a912de5bba500
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_forceglobal
@@ -0,0 +1,64 @@
+
+// Inline PTX register constraints: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: add, sub (as add of the negated value), min, max, inc, dec
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
new file mode 100644
index 0000000000000000000000000000000000000000..5de36a3e0a87b967fff5d9a936644c5e8a566051
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_generic
@@ -0,0 +1,64 @@
+
+// Inline PTX register constraints: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: add, sub (as add of the negated value), min, max, inc, dec
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile("red.add" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.min" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile("red.max" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.inc" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile("red.dec" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
new file mode 100644
index 0000000000000000000000000000000000000000..ba8937883423e62b9461b6d764d24022531db719
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_isglobal
@@ -0,0 +1,88 @@
+
+// Inline PTX register constraints: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: add, sub (as add of the negated value), min, max, inc, dec
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+  } else { \
+  asm volatile("red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(neg_value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } else { \
+  asm volatile("red.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(value) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } else { \
+  asm volatile("red.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  if(__isGlobal(dest)) { \
+  asm volatile("red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } else { \
+  asm volatile("red.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;" :: "l"(dest),reg_type(limit) : "memory"); \
+  } \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
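The `__isGlobal` branch exists because the same overload can be handed either a global-memory or a shared-memory address, and only the former may use the `.global` forms. A hedged sketch of a kernel that exercises both paths (header path assumed; block size assumed to be at least 64):

```cpp
#include <desul/atomics.hpp>  // assumed umbrella header

__global__ void block_histogram(const int* bins, int n, int* global_hist) {
  __shared__ int local_hist[64];               // shared memory: __isGlobal() is false
  if (threadIdx.x < 64) local_hist[threadIdx.x] = 0;
  __syncthreads();

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // dest is a shared-memory address: the generic (non-.global) path is taken.
    desul::atomic_add(&local_hist[bins[i] & 63], 1,
                      desul::MemoryOrderRelaxed(), desul::MemoryScopeCore());
  }
  __syncthreads();

  if (threadIdx.x < 64) {
    // dest is a global-memory address: the ".global" fast path is taken.
    desul::atomic_add(&global_hist[threadIdx.x], local_hist[threadIdx.x],
                      desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
  }
}
```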
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
new file mode 100644
index 0000000000000000000000000000000000000000..46e0ccf5e747e151039f68e17a72c82a05fc14fc
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc_predicate
@@ -0,0 +1,106 @@
+
+// Inline PTX register constraints: h = u16, r = u32, l = u64, f = f32, d = f64
+// Ops: add, sub (as add of the negated value), min, max, inc, dec
+
+// Non Returning Atomic Operations
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+inline __device__ void atomic_add(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+inline __device__ void atomic_sub(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type neg_value = -value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.add.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.add"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(neg_value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+inline __device__ void atomic_min(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.min.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.min"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+inline __device__ void atomic_max(type* dest, type value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.max.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.max"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(value) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+inline __device__ void atomic_inc(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.inc.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.inc"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type) \
+inline __device__ void atomic_dec(type* dest, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  type limit = desul::Impl::numeric_limits_max<type>::value; \
+  asm volatile( \
+          "{\n\t" \
+          ".reg .pred p;\n\t" \
+          "isspacep.global p, %0;\n\t" \
+          "@p  red.dec.global" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "@!p red.dec"        __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM asm_type " [%0],%1;\n\t" \
+          "}\n\t" \
+    :: "l"(dest),reg_type(limit) : "memory"); \
+}
+
+// Group ops for integer types
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type)
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MIN(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_MAX(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INC(type,asm_type,reg_type) \
+__DESUL_IMPL_CUDA_ASM_ATOMIC_DEC(type,asm_type,reg_type)
+
+// Instantiate Functions
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(float,".f32","f")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_ADD(double,".f64","d")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_SUB(double,".f64","d")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_UNSIGNED_OP(uint32_t,".u32","r")
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".u64","l")
+__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int32_t,".s32","r")
+//__DESUL_IMPL_CUDA_ASM_ATOMIC_INTEGER_OP(int64_t,".s64","l")
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
new file mode 100644
index 0000000000000000000000000000000000000000..dfd211249fcc625733e1737d9f2258bf9d066ef9
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange.inc
@@ -0,0 +1,20 @@
+
+// Instantiate the exchange operations for each memory scope supported by the PTX atom instruction
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeDevice
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".gpu"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeNode
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".sys"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE MemoryScopeCore
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".cta"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
new file mode 100644
index 0000000000000000000000000000000000000000..7b4f7d094e8ac566213b6f11938dab721da003fa
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_memorder.inc
@@ -0,0 +1,27 @@
+
+// Instantiate the exchange operations for each memory order (relaxed, release, acquire, acq_rel)
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelaxed
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".relaxed"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelease
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".release"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcquire
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acquire"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcqRel
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acq_rel"
+#include "desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..51d992087e35f8842fbc143f8e903273499d4507
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_exchange_op.inc
@@ -0,0 +1,40 @@
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_exchange(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.exch" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2;" : "=r"(asm_result) : "l"(dest),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_exchange(ctype* dest, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.exch" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2;" : "=l"(asm_result) : "l"(dest),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+#define __DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE() \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==4, ctype>::type atomic_compare_exchange(ctype* dest, ctype compare, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint32_t asm_value = reinterpret_cast<uint32_t&>(value); \
+  uint32_t asm_compare = reinterpret_cast<uint32_t&>(compare); \
+  uint32_t asm_result = 0u; \
+  asm volatile("atom.cas" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b32" " %0,[%1],%2,%3;" : "=r"(asm_result) : "l"(dest),"r"(asm_compare),"r"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+} \
+template<class ctype> \
+inline __device__ typename std::enable_if<sizeof(ctype)==8, ctype>::type atomic_compare_exchange(ctype* dest, ctype compare, ctype value, __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER, __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE) { \
+  uint64_t asm_value = reinterpret_cast<uint64_t&>(value); \
+  uint64_t asm_compare = reinterpret_cast<uint64_t&>(compare); \
+  uint64_t asm_result = 0u; \
+  asm volatile("atom.cas" __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM __DESUL_IMPL_CUDA_ASM_MEMORY_SCOPE_ASM ".b64" " %0,[%1],%2,%3;" : "=l"(asm_result) : "l"(dest),"l"(asm_compare),"l"(asm_value) : "memory"); \
+  return reinterpret_cast<ctype&>(asm_result); \
+}
+
+__DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE()
+__DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE()
+
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_EXCHANGE
+#undef __DESUL_IMPL_CUDA_ASM_ATOMIC_COMPARE_EXCHANGE
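The compare-exchange above is the building block for read-modify-write operations that have no single PTX instruction. A hedged sketch of an atomic max for `double` built as a standard CAS loop on top of it (header path and the exact public entry point are assumptions):

```cpp
#include <desul/atomics.hpp>  // assumed umbrella header

__device__ double atomic_max_double_sketch(double* dest, double value) {
  double expected = *dest;  // non-atomic initial guess is fine for a CAS loop
  while (expected < value) {
    // atomic_compare_exchange returns the value observed at *dest;
    // if it equals 'expected', our 'value' was stored.
    double old = desul::atomic_compare_exchange(dest, expected, value,
                                                desul::MemoryOrderRelaxed(),
                                                desul::MemoryScopeDevice());
    if (old == expected) break;  // we installed the new maximum
    expected = old;              // lost the race; retry against the fresher value
  }
  return expected;  // value seen at *dest before our (possible) update
}
```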
diff --git a/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
new file mode 100644
index 0000000000000000000000000000000000000000..3eb613d8a74c5bfc911bd51787dac9c4a972f698
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/cuda/cuda_cc7_asm_memorder.inc
@@ -0,0 +1,29 @@
+
+// The non-returning atomic operations (PTX red instruction) only exist for the relaxed and release memory orders, so the atomic_op include appears only in those two passes below.
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelaxed
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".relaxed"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderRelease
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".release"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcquire
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acquire"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER MemoryOrderAcqRel
+#define __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM ".acq_rel"
+#include "desul/atomics/cuda/cuda_cc7_asm_atomic_fetch_op.inc"
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER
+#undef __DESUL_IMPL_CUDA_ASM_MEMORY_ORDER_ASM
+
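The define/include/undef pattern above stamps out one overload set per memory order, and only the relaxed and release passes pull in the non-returning `atomic_op` file. A minimal, self-contained mock of the same multiple-inclusion technique in plain C++ (no desul headers; the per-pass include is inlined for brevity):

```cpp
#include <cstdio>

// Stand-ins for desul's memory-order tag types.
struct MemoryOrderRelaxed {};
struct MemoryOrderRelease {};

// Pass 1: define the tokens, paste the "op" snippet, then undefine.
#define MEMORY_ORDER MemoryOrderRelaxed
#define MEMORY_ORDER_ASM ".relaxed"
void print_suffix(MEMORY_ORDER) { std::puts("atom.add" MEMORY_ORDER_ASM ".gpu.u32"); }
#undef MEMORY_ORDER
#undef MEMORY_ORDER_ASM

// Pass 2: identical snippet text, different tokens -> a distinct overload.
#define MEMORY_ORDER MemoryOrderRelease
#define MEMORY_ORDER_ASM ".release"
void print_suffix(MEMORY_ORDER) { std::puts("atom.add" MEMORY_ORDER_ASM ".gpu.u32"); }
#undef MEMORY_ORDER
#undef MEMORY_ORDER_ASM

int main() {
  print_suffix(MemoryOrderRelaxed{});  // prints: atom.add.relaxed.gpu.u32
  print_suffix(MemoryOrderRelease{});  // prints: atom.add.release.gpu.u32
}
```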
diff --git a/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f4f1bbd96ee6b73d5ec15372256bb4177c033234
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40.hpp
@@ -0,0 +1,97 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#ifndef DESUL_ATOMICS_OPENMP40_HPP_
+#define DESUL_ATOMICS_OPENMP40_HPP_
+#include<type_traits>
+
+namespace desul {
+namespace Impl {
+  template<class MEMORY_ORDER_TMP, class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MEMORY_ORDER_TMP, MEMORY_SCOPE_TMP) {}
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderAcquire, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcquire(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderAcqRel, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcqRel(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_pre_capture_flush(MemoryOrderSeqCst, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderSeqCst(), MEMORY_SCOPE_TMP());
+  }
+
+  template<class MEMORY_ORDER_TMP, class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MEMORY_ORDER_TMP, MEMORY_SCOPE_TMP) {}
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderRelease, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderRelease(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderAcqRel, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderAcqRel(), MEMORY_SCOPE_TMP());
+  }
+  template<class MEMORY_SCOPE_TMP>
+  void openmp_maybe_call_post_capture_flush(MemoryOrderSeqCst, MEMORY_SCOPE_TMP) {
+    atomic_thread_fence(MemoryOrderSeqCst(), MEMORY_SCOPE_TMP());
+  }
+
+  template<class T>
+  struct is_openmp_atomic_type_t {
+    static constexpr bool value = std::is_arithmetic<T>::value;
+  };
+  template<class T>
+  constexpr bool is_openmp_atomic_type_v = is_openmp_atomic_type_t<T>::value;
+}
+}
+
+namespace desul {
+// A macro-based approach cannot generate these definitions because the operations
+// contain #pragma omp, so we rely on multiple inclusion of the same code snippet instead.
+
+// Node-level atomics cannot be done this way with OpenMP Target, but we could add a
+// define that states whether Device scope IS node scope (e.g. for a pure CPU node).
+
+#define MEMORY_ORDER MemoryOrderRelaxed
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+
+#define MEMORY_ORDER MemoryOrderAcqRel
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+
+#define MEMORY_ORDER MemoryOrderSeqCst
+// #define MEMORY_SCOPE MemoryScopeNode
+// #include<desul/atomics/openmp/OpenMP_40_op.inc>
+// #undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeDevice
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#define MEMORY_SCOPE MemoryScopeCore
+#include<desul/atomics/openmp/OpenMP_40_op.inc>
+#undef MEMORY_SCOPE
+#undef MEMORY_ORDER
+}  // namespace desul
+#endif
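A hedged host-side usage sketch of what the OpenMP backend above provides: each call lowers to an `#pragma omp atomic capture`, with `atomic_thread_fence` calls added before and/or after the capture for the acq_rel and seq_cst orders (relaxed adds none). It assumes a desul build where this OpenMP implementation is the active host backend and that `desul/atomics.hpp` is the umbrella header:

```cpp
#include <desul/atomics.hpp>  // assumed umbrella header
#include <cstdio>

int main() {
  int counter = 0;
#pragma omp parallel for
  for (int i = 0; i < 1000; ++i) {
    // Lowers to '#pragma omp atomic capture' surrounded by seq_cst fences.
    desul::atomic_fetch_add(&counter, 1,
                            desul::MemoryOrderSeqCst(),
                            desul::MemoryScopeDevice());
  }
  std::printf("counter = %d\n", counter);  // expected: 1000
  return 0;
}
```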
diff --git a/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc
new file mode 100644
index 0000000000000000000000000000000000000000..a65f2a457dff8b2ec6e186411359e73b729fb5e1
--- /dev/null
+++ b/packages/kokkos/core/src/desul/atomics/openmp/OpenMP_40_op.inc
@@ -0,0 +1,101 @@
+
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_add(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest += value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_sub(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest -= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_and(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest &= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_or(   
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest |= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_fetch_xor(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { tmp = *dest;  *dest ^= value; }                                             
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_add_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest += value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_sub_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest -= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_and_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest &= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_or_fetch(   
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest |= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }                                                                               
+  template <typename T>                                                           
+  std::enable_if_t<Impl::is_openmp_atomic_type_v<T>,T> atomic_xor_fetch(  
+      T* const dest, T value, MEMORY_ORDER, MEMORY_SCOPE) {                       
+    T tmp;                                                                        
+    Impl::openmp_maybe_call_pre_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());    
+    #pragma omp atomic capture                                                    
+    { *dest ^= value; tmp = *dest; }                                              
+    Impl::openmp_maybe_call_post_capture_flush(MEMORY_ORDER(), MEMORY_SCOPE());   
+    return tmp;                                                                   
+  }
diff --git a/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp b/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8913f8bc7b80fc11844438f384582e7a036c824f
--- /dev/null
+++ b/packages/kokkos/core/src/desul/src/Lock_Array_CUDA.cpp
@@ -0,0 +1,98 @@
+/* 
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#include <desul/atomics/Lock_Array.hpp>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+#include <stdexcept>
+
+#ifdef DESUL_HAVE_CUDA_ATOMICS
+#ifdef __CUDACC_RDC__
+namespace desul {
+namespace Impl {
+__device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
+__device__ __constant__ int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE = nullptr;
+}
+}  // namespace desul
+#endif
+
+namespace desul {
+
+namespace {
+
+__global__ void init_lock_arrays_cuda_kernel() {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < CUDA_SPACE_ATOMIC_MASK + 1) {
+    Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE[i] = 0;
+    Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE[i] = 0;
+  }
+}
+
+}  // namespace
+
+namespace Impl {
+
+
+int32_t* CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+int32_t* CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+
+// Putting this into an anonymous namespace so we don't get multiply defined symbols
+// when linking in more than one copy of the object file.
+namespace {
+
+void check_error_and_throw_cuda(cudaError e, const std::string& msg) {
+  if(e != cudaSuccess) {
+    std::ostringstream out;
+    out << "Desul::Error: " << msg << " error(" << cudaGetErrorName(e)
+                  << "): " << cudaGetErrorString(e);
+    throw std::runtime_error(out.str());
+  }
+}
+
+}
+
+// define functions
+template<typename T>
+void init_lock_arrays_cuda() {
+  if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
+  auto error_malloc1 = cudaMalloc(&CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
+                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc1, "init_lock_arrays_cuda: cudaMalloc device locks");
+
+  auto error_malloc2 = cudaMallocHost(&CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
+                                 sizeof(int32_t) * (CUDA_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_cuda(error_malloc2, "init_lock_arrays_cuda: cudaMallocHost host locks");
+
+  auto error_sync1 = cudaDeviceSynchronize();
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
+  init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
+  auto error_sync2 = cudaDeviceSynchronize();
+  check_error_and_throw_cuda(error_sync2, "init_lock_arrays_cuda: post init kernel");
+}
+
+template<typename T>
+void finalize_lock_arrays_cuda() {
+  if (CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
+  cudaFree(CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h);
+  cudaFreeHost(CUDA_SPACE_ATOMIC_LOCKS_NODE_h);
+  CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+  CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+#ifdef __CUDACC_RDC__
+  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+// Instantiate functions
+template void init_lock_arrays_cuda<int>();
+template void finalize_lock_arrays_cuda<int>();
+
+}  // namespace Impl
+
+}  // namespace desul
+#endif
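Host code is expected to set these lock arrays up once before any kernel that may fall back to lock-based atomics, and to tear them down at shutdown. A hedged sketch, assuming the declarations live in the `Lock_Array.hpp` header included above and that the translation unit is compiled as CUDA:

```cpp
#include <desul/atomics/Lock_Array.hpp>

int main() {
  // Allocates and zero-fills the device and host-pinned lock arrays;
  // returns early if they were already initialized.
  desul::Impl::init_lock_arrays_cuda<int>();

  // ... launch kernels that may use desul's lock-based atomic fallback ...

  // Frees both arrays and resets the host-side pointers to nullptr.
  desul::Impl::finalize_lock_arrays_cuda<int>();
  return 0;
}
```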
diff --git a/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp b/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40030df643fa1e85a0fe433b0135386418590193
--- /dev/null
+++ b/packages/kokkos/core/src/desul/src/Lock_Array_HIP.cpp
@@ -0,0 +1,101 @@
+/*
+Copyright (c) 2019, Lawrence Livermore National Security, LLC
+and DESUL project contributors. See the COPYRIGHT file for details.
+Source: https://github.com/desul/desul
+
+SPDX-License-Identifier: (BSD-3-Clause)
+*/
+
+#include <cinttypes>
+#include <desul/atomics/Lock_Array.hpp>
+#include <string>
+#include <sstream>
+#include <stdexcept>
+
+#ifdef DESUL_HAVE_HIP_ATOMICS
+#ifdef DESUL_HIP_RDC
+namespace desul {
+namespace Impl {
+__device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE = nullptr;
+__device__ __constant__ int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE = nullptr;
+}  // namespace Impl
+}  // namespace desul
+#endif
+
+namespace desul {
+
+namespace {
+
+__global__ void init_lock_arrays_hip_kernel() {
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < HIP_SPACE_ATOMIC_MASK + 1) {
+    Impl::HIP_SPACE_ATOMIC_LOCKS_DEVICE[i] = 0;
+    Impl::HIP_SPACE_ATOMIC_LOCKS_NODE[i] = 0;
+  }
+}
+
+}  // namespace
+
+namespace Impl {
+
+int32_t* HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+int32_t* HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+
+// Putting this into an anonymous namespace so we don't get multiply defined symbols
+// when linking in more than one copy of the object file.
+namespace {
+
+void check_error_and_throw_hip(hipError_t e, const std::string& msg) {
+  if(e != hipSuccess) {
+    std::ostringstream out;
+    out << "Desul::Error: " << msg << " error(" << hipGetErrorName(e)
+                  << "): " << hipGetErrorString(e);
+    throw std::runtime_error(out.str());
+  }
+}
+
+}
+
+template<typename T>
+void init_lock_arrays_hip() {
+  if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h != nullptr) return;
+
+  auto error_malloc1 = hipMalloc(&HIP_SPACE_ATOMIC_LOCKS_DEVICE_h,
+            sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc1, "init_lock_arrays_hip: hipMalloc device locks");
+
+  auto error_malloc2 = hipHostMalloc(&HIP_SPACE_ATOMIC_LOCKS_NODE_h,
+                sizeof(int32_t) * (HIP_SPACE_ATOMIC_MASK + 1));
+  check_error_and_throw_hip(error_malloc2, "init_lock_arrays_hip: hipHostMalloc host locks");
+
+  auto error_sync1 = hipDeviceSynchronize();
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+  check_error_and_throw_hip(error_sync1, "init_lock_arrays_hip: post malloc");
+
+  init_lock_arrays_hip_kernel<<<(HIP_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
+
+  auto error_sync2 = hipDeviceSynchronize();
+  check_error_and_throw_hip(error_sync2, "init_lock_arrays_hip: post init");
+}
+
+template<typename T>
+void finalize_lock_arrays_hip() {
+  if (HIP_SPACE_ATOMIC_LOCKS_DEVICE_h == nullptr) return;
+  auto error_free1 = hipFree(HIP_SPACE_ATOMIC_LOCKS_DEVICE_h);
+  check_error_and_throw_hip(error_free1, "finalize_lock_arrays_hip: free device locks");
+  auto error_free2 = hipHostFree(HIP_SPACE_ATOMIC_LOCKS_NODE_h);
+  check_error_and_throw_hip(error_free2, "finalize_lock_arrays_hip: free host locks");
+  HIP_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
+  HIP_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
+#ifdef DESUL_HIP_RDC
+  DESUL_IMPL_COPY_HIP_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+template void init_lock_arrays_hip<int>();
+template void finalize_lock_arrays_hip<int>();
+
+}  // namespace Impl
+
+}  // namespace desul
+#endif
+
diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
index 7754daa8a0189a3d0708ce6505955be4b76b2d61..0ce680cd69efb1de0e9cbebadfda1f739e325630 100644
--- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
+++ b/packages/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp
@@ -52,6 +52,8 @@ class SYCLDeviceUSMSpace;  ///< Memory space on SYCL device, not accessible from
                            ///< the host
 class SYCLSharedUSMSpace;  ///< Memory space accessible from both the SYCL
                            ///< device and the host
+class SYCLHostUSMSpace;    ///< Memory space accessible from both the SYCL
+                           ///< device and the host (host pinned)
 class SYCL;                ///< Execution space for SYCL
 }  // namespace Experimental
 }  // namespace Kokkos
diff --git a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
index 7f72b3983f57c9adea157cf70d815339696cd986..5167c9ed65b42b1e567286849f37e89616e0e980 100644
--- a/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
+++ b/packages/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -1518,28 +1518,14 @@ struct Tile_Loop_Type<
 };
 // end Structs for calling loops
 
-template <typename T>
-using is_void_type = std::is_same<T, void>;
-
-template <typename T>
-struct is_type_array : std::false_type {
-  using value_type = T;
-};
-
-template <typename T>
-struct is_type_array<T[]> : std::true_type {
-  using value_type = T;
-};
-
 template <typename RP, typename Functor, typename Tag = void,
           typename ValueType = void, typename Enable = void>
 struct HostIterateTile;
 
 // For ParallelFor
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<is_void_type<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<std::is_void<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
@@ -1947,10 +1933,9 @@ struct HostIterateTile<
 // For ParallelReduce
 // ValueType - scalar: For reductions
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<!is_void_type<ValueType>::value &&
-                            !is_type_array<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<!std::is_void<ValueType>::value &&
+                                        !std::is_array<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
@@ -2370,17 +2355,16 @@ struct HostIterateTile<
 // Extra specialization for array reductions
 // ValueType[]: For array reductions
 template <typename RP, typename Functor, typename Tag, typename ValueType>
-struct HostIterateTile<
-    RP, Functor, Tag, ValueType,
-    typename std::enable_if<!is_void_type<ValueType>::value &&
-                            is_type_array<ValueType>::value>::type> {
+struct HostIterateTile<RP, Functor, Tag, ValueType,
+                       std::enable_if_t<!std::is_void<ValueType>::value &&
+                                        std::is_array<ValueType>::value>> {
   using index_type = typename RP::index_type;
   using point_type = typename RP::point_type;
 
   using value_type =
-      typename is_type_array<ValueType>::value_type;  // strip away the
-                                                      // 'array-ness' [], only
-                                                      // underlying type remains
+      std::remove_extent_t<ValueType>;  // strip away the
+                                        // 'array-ness' [], only
+                                        // underlying type remains
 
   inline HostIterateTile(
       RP const& rp, Functor const& func,
diff --git a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
index c513817b5b8cbd74847e180099081bb475020c44..20fc6268c7dcd30c5c8deac332c61be673c7cc3d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -63,12 +63,39 @@
 namespace Kokkos {
 namespace Impl {
 
+//==============================================================================
+// <editor-fold desc="AnalyzePolicyBaseTraits"> {{{1
+
+// Mix in the defaults (base_traits) for the traits that aren't yet handled
+
 //------------------------------------------------------------------------------
+// <editor-fold desc="MSVC EBO failure workaround"> {{{2
+
+template <class TraitSpecList>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
+template <class... TraitSpecifications>
+struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
+    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
+    : TraitSpecifications::base_traits... {};
+
+// </editor-fold> end AnalyzePolicyBaseTraits }}}1
+//==============================================================================
 
-using execution_policy_trait_specifications =
-    type_list<ExecutionSpaceTrait, GraphKernelTrait, IndexTypeTrait,
-              IterationPatternTrait, LaunchBoundsTrait, OccupancyControlTrait,
-              ScheduleTrait, WorkItemPropertyTrait, WorkTagTrait>;
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+
+//------------------------------------------------------------------------------
+// Note: unspecialized, so that the default pathway is to fall back to using
+// the PolicyTraitMatcher. See AnalyzeExecPolicyUseMatcher below
+template <class Enable, class... Traits>
+struct AnalyzeExecPolicy
+    : AnalyzeExecPolicyUseMatcher<void, execution_policy_trait_specifications,
+                                  Traits...> {
+  using base_t =
+      AnalyzeExecPolicyUseMatcher<void, execution_policy_trait_specifications,
+                                  Traits...>;
+  using base_t::base_t;
+};
 
 //------------------------------------------------------------------------------
 // Ignore void for backwards compatibility purposes, though hopefully no one is
@@ -81,15 +108,6 @@ struct AnalyzeExecPolicy<void, void, Traits...>
 };
 
 //------------------------------------------------------------------------------
-// Mix in the defaults (base_traits) for the traits that aren't yet handled
-
-template <class TraitSpecList>
-struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION AnalyzeExecPolicyBaseTraits;
-template <class... TraitSpecifications>
-struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION
-    AnalyzeExecPolicyBaseTraits<type_list<TraitSpecifications...>>
-    : TraitSpecifications::base_traits... {};
-
 template <>
 struct AnalyzeExecPolicy<void>
     : AnalyzeExecPolicyBaseTraits<execution_policy_trait_specifications> {
@@ -108,6 +126,68 @@ struct AnalyzeExecPolicy<void>
   }
 };
 
+// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+//==============================================================================
+
+//==============================================================================
+// <editor-fold desc="AnalyzeExecPolicyUseMatcher"> {{{1
+
+// We can avoid having policies specialize AnalyzeExecPolicy themselves
+// by piggy-backing off of the PolicyTraitMatcher that we already need for
+// things like require(). We mix in the effects of the trait using the
+// `mixin_matching_trait` nested alias template in the trait specification.
+
+// General PolicyTraitMatcher version
+
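+// Each specialization peels one TraitSpec off the type_list: if it matches the
+// current Trait it mixes in that trait's effects and recurses on the remaining
+// Traits, otherwise it tries the next TraitSpec in the list.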
+// Matching case
+template <class TraitSpec, class... TraitSpecs, class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<
+    std::enable_if_t<PolicyTraitMatcher<TraitSpec, Trait>::value>,
+    type_list<TraitSpec, TraitSpecs...>, Trait, Traits...>
+    : TraitSpec::template mixin_matching_trait<
+          Trait, AnalyzeExecPolicy<void, Traits...>> {
+  using base_t = typename TraitSpec::template mixin_matching_trait<
+      Trait, AnalyzeExecPolicy<void, Traits...>>;
+  using base_t::base_t;
+};
+
+// Non-matching case
+template <class TraitSpec, class... TraitSpecs, class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<
+    std::enable_if_t<!PolicyTraitMatcher<TraitSpec, Trait>::value>,
+    type_list<TraitSpec, TraitSpecs...>, Trait, Traits...>
+    : AnalyzeExecPolicyUseMatcher<void, type_list<TraitSpecs...>, Trait,
+                                  Traits...> {
+  using base_t = AnalyzeExecPolicyUseMatcher<void, type_list<TraitSpecs...>,
+                                             Trait, Traits...>;
+  using base_t::base_t;
+};
+
+// No match found case:
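+// show_name_of_invalid_execution_policy_trait is declared but never defined;
+// constructing it below makes the compiler spell out the offending Trait type
+// in its diagnostic.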
+template <class>
+struct show_name_of_invalid_execution_policy_trait;
+template <class Trait, class... Traits>
+struct AnalyzeExecPolicyUseMatcher<void, type_list<>, Trait, Traits...> {
+  static constexpr auto trigger_error_message =
+      show_name_of_invalid_execution_policy_trait<Trait>{};
+  static_assert(
+      /* always false: */ std::is_void<Trait>::value,
+      "Unknown execution policy trait. Search compiler output for "
+      "'show_name_of_invalid_execution_policy_trait' to see the type of the "
+      "invalid trait.");
+};
+
+// All traits matched case:
+template <>
+struct AnalyzeExecPolicyUseMatcher<void, type_list<>>
+    : AnalyzeExecPolicy<void> {
+  using base_t = AnalyzeExecPolicy<void>;
+  using base_t::base_t;
+};
+
+// </editor-fold> end AnalyzeExecPolicyUseMatcher }}}1
+//==============================================================================
+
 //------------------------------------------------------------------------------
 // Used for defaults that depend on other analysis results
 template <class AnalysisResults>
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
index dd571eb6d72e23bf0d028493fbeec42f645b382b..d481a8dc0f21efa675e0b181a8c6981e1f9afce6 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -51,10 +51,6 @@
     !defined(KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP)
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 #include <impl/Kokkos_Atomic_Memory_Order.hpp>
 #include <impl/Kokkos_Memory_Fence.hpp>
 
@@ -115,13 +111,9 @@ __inline__ __device__ T atomic_compare_exchange(
                             const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
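+  // Loop until the ballot of 'done' flags covers every thread that was active
+  // at entry, so all lanes of the (possibly divergent) warp make progress.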
   while (active != done_active) {
     if (!done) {
@@ -134,11 +126,7 @@ __inline__ __device__ T atomic_compare_exchange(
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
index bbea3c99b8fcfbf5c25132c913f92b7337001806..4bb8b4fd52af0c8beaf8c4dfadddfa7be58c5c54 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Weak.hpp
@@ -51,10 +51,6 @@
 #ifndef KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_WEAK_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
index f2c1c756a910d26de0eb3765e0b90684e564d243..cd840983d8a3bd24cace6e411cabc940d44ddfe1 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_EXCHANGE_HPP)
 #define KOKKOS_ATOMIC_EXCHANGE_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -122,13 +118,9 @@ atomic_exchange(volatile T* const dest,
   _mm_prefetch((const char*)dest, _MM_HINT_ET0);
 #endif
 
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -141,11 +133,7 @@ atomic_exchange(volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
index 5c3f825ed100450bac57110829f64094b782011d..9a2b13debc70f24bf6adb34ddee13815458245b3 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_ADD_HPP)
 #define KOKKOS_ATOMIC_FETCH_ADD_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -148,13 +144,9 @@ atomic_fetch_add(volatile T* const dest,
                                          const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -169,11 +161,7 @@ atomic_fetch_add(volatile T* const dest,
       }
     }
 
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
index c3446ae6a3bda89fac094ab9693688c2be9f77a5..148ed974420ff88d2831a2ac98ac70a8ea5f4bf2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -50,10 +50,6 @@
 #if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_ATOMIC_FETCH_SUB_HPP)
 #define KOKKOS_ATOMIC_FETCH_SUB_HPP
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
@@ -143,13 +139,9 @@ atomic_fetch_sub(volatile T* const dest,
                                          const T>::type& val) {
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
-  int done = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask   = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  int done                 = 0;
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -162,11 +154,7 @@ atomic_fetch_sub(volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
index 28ac7a3bab9e748f9d315ca479f57db885ed75c4..f6bdbca729a335e4218ec3ac9108f0c3046eac05 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -47,10 +47,6 @@
 #define KOKKOS_ATOMIC_GENERIC_HPP
 #include <Kokkos_Macros.hpp>
 
-#if defined(KOKKOS_ENABLE_CUDA)
-#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
-#endif
-
 // Combination operands to be used in an Compare and Exchange based atomic
 // operation
 namespace Kokkos {
@@ -301,12 +297,8 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
   // This is a way to (hopefully) avoid dead lock in a warp
   T return_val;
   int done                 = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask        = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active      = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -319,11 +311,7 @@ KOKKOS_INLINE_FUNCTION T atomic_fetch_oper(
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 #elif defined(__HIP_DEVICE_COMPILE__)
@@ -377,12 +365,8 @@ atomic_oper_fetch(const Oper& op, volatile T* const dest,
   T return_val;
   // This is a way to (hopefully) avoid dead lock in a warp
   int done                 = 0;
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-  unsigned int mask        = KOKKOS_IMPL_CUDA_ACTIVEMASK;
-  unsigned int active      = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
-#else
-  unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1);
-#endif
+  unsigned int mask        = __activemask();
+  unsigned int active      = __ballot_sync(mask, 1);
   unsigned int done_active = 0;
   while (active != done_active) {
     if (!done) {
@@ -395,11 +379,7 @@ atomic_oper_fetch(const Oper& op, volatile T* const dest,
         done = 1;
       }
     }
-#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
-    done_active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, done);
-#else
-    done_active = KOKKOS_IMPL_CUDA_BALLOT(done);
-#endif
+    done_active = __ballot_sync(mask, done);
   }
   return return_val;
 #elif defined(__HIP_DEVICE_COMPILE__)
diff --git a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
index 975318b7dde67a1d1569c3cf657060c3ae18215d..f763f8c7916e4875e6c1b2a6c3733c89c532f6bb 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -339,9 +339,8 @@ class AtomicDataElement {
   }
 
   KOKKOS_INLINE_FUNCTION
-  operator volatile non_const_value_type() volatile const {
-    // return Kokkos::atomic_load(ptr);
-    return *ptr;
+  operator non_const_value_type() volatile const {
+    return Kokkos::Impl::atomic_load(ptr);
   }
 };
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
index 4e46b8d157f83129182d4db9b725bcddbe3ed28b..87f18604da52a62c6c8de22e8c670169ceec643a 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ClockTic.hpp
@@ -55,7 +55,7 @@
 // To use OpenCL(TM) built-in intrinsics inside kernels, we have to
 // forward-declare their prototype, also see
 // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md
-#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \
     defined(__SYCL_DEVICE_ONLY__)
 extern SYCL_EXTERNAL unsigned long __attribute__((overloadable))
 intel_get_cycle_counter();
@@ -85,7 +85,7 @@ uint64_t clock_tic() noexcept {
 
   return clock64();
 
-#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GEN) && \
+#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \
     defined(__SYCL_DEVICE_ONLY__)
   return intel_get_cycle_counter();
 #elif defined(KOKKOS_ENABLE_OPENMPTARGET)
diff --git a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
index 06681a95ae902c613c701cd78ff572d35da6c0a1..4ec8513191f21e07896bac21274e3af088dfe518 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp
@@ -76,17 +76,17 @@ struct CombinedReducerValueItemImpl {
       CombinedReducerValueItemImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl(
       CombinedReducerValueItemImpl&&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
-  operator=(CombinedReducerValueItemImpl const&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerValueItemImpl&
-  operator=(CombinedReducerValueItemImpl&&) = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=(
+      CombinedReducerValueItemImpl const&) = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=(
+      CombinedReducerValueItemImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReducerValueItemImpl() = default;
   explicit KOKKOS_FUNCTION CombinedReducerValueItemImpl(value_type arg_value)
       : m_value(std::move(arg_value)) {}
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 value_type& ref() & noexcept { return m_value; }
+  constexpr value_type& ref() & noexcept { return m_value; }
   KOKKOS_FORCEINLINE_FUNCTION
   constexpr value_type const& ref() const& noexcept { return m_value; }
   KOKKOS_FORCEINLINE_FUNCTION
@@ -112,11 +112,11 @@ struct CombinedReducerValueImpl<std::integer_sequence<size_t, Idxs...>,
   KOKKOS_DEFAULTED_FUNCTION
   constexpr CombinedReducerValueImpl(CombinedReducerValueImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
+  constexpr CombinedReducerValueImpl& operator=(
       CombinedReducerValueImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReducerValueImpl& operator=(
-      CombinedReducerValueImpl&&) = default;
+  constexpr CombinedReducerValueImpl& operator=(CombinedReducerValueImpl&&) =
+      default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReducerValueImpl() = default;
 
@@ -165,20 +165,19 @@ struct CombinedReducerStorageImpl {
   // model Reducer
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _init(value_type& val) const {
+  constexpr _fold_comma_emulation_return _init(value_type& val) const {
     m_reducer.init(val);
     return _fold_comma_emulation_return{};
   }
 
-  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _join(value_type& dest, value_type const& src) const {
+  KOKKOS_INLINE_FUNCTION constexpr _fold_comma_emulation_return _join(
+      value_type& dest, value_type const& src) const {
     m_reducer.join(dest, src);
     return _fold_comma_emulation_return{};
   }
 
-  KOKKOS_INLINE_FUNCTION KOKKOS_CONSTEXPR_14 _fold_comma_emulation_return
-  _join(value_type volatile& dest, value_type const volatile& src) const {
+  KOKKOS_INLINE_FUNCTION constexpr _fold_comma_emulation_return _join(
+      value_type volatile& dest, value_type const volatile& src) const {
     m_reducer.join(dest, src);
     return _fold_comma_emulation_return{};
   }
@@ -242,10 +241,10 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
       CombinedReducerImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl(
-      CombinedReducerImpl&&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
+      CombinedReducerImpl&&)                                       = default;
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=(
       CombinedReducerImpl const&) = default;
-  KOKKOS_DEFAULTED_FUNCTION KOKKOS_CONSTEXPR_14 CombinedReducerImpl& operator=(
+  KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=(
       CombinedReducerImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION ~CombinedReducerImpl() = default;
@@ -257,9 +256,8 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
                                                        reducers)...,
         m_value_view(&value) {}
 
-  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void join(value_type& dest,
-                                                value_type const& src) const
-      noexcept {
+  KOKKOS_FUNCTION constexpr void join(value_type& dest,
+                                      value_type const& src) const noexcept {
     emulate_fold_comma_operator(
         this->CombinedReducerStorageImpl<Idxs, Reducers>::_join(
             dest.template get<Idxs, typename Reducers::value_type>(),
@@ -274,8 +272,7 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
             src.template get<Idxs, typename Reducers::value_type>())...);
   }
 
-  KOKKOS_FUNCTION KOKKOS_CONSTEXPR_14 void init(value_type& dest) const
-      noexcept {
+  KOKKOS_FUNCTION constexpr void init(value_type& dest) const noexcept {
     emulate_fold_comma_operator(
         this->CombinedReducerStorageImpl<Idxs, Reducers>::_init(
             dest.template get<Idxs, typename Reducers::value_type>())...);
@@ -298,7 +295,7 @@ struct CombinedReducerImpl<std::integer_sequence<size_t, Idxs...>, Space,
   }
 
   KOKKOS_FUNCTION
-  KOKKOS_CONSTEXPR_14 static void write_value_back_to_original_references(
+  constexpr static void write_value_back_to_original_references(
       value_type const& value,
       Reducers const&... reducers_that_reference_original_values) noexcept {
     emulate_fold_comma_operator(
@@ -360,10 +357,10 @@ struct CombinedReductionFunctorWrapperImpl<
   constexpr CombinedReductionFunctorWrapperImpl(
       CombinedReductionFunctorWrapperImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=(
+  constexpr CombinedReductionFunctorWrapperImpl& operator=(
       CombinedReductionFunctorWrapperImpl const&) = default;
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14 CombinedReductionFunctorWrapperImpl& operator=(
+  constexpr CombinedReductionFunctorWrapperImpl& operator=(
       CombinedReductionFunctorWrapperImpl&&) = default;
   KOKKOS_DEFAULTED_FUNCTION
   ~CombinedReductionFunctorWrapperImpl() = default;
@@ -551,7 +548,7 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
                      ReturnType2&& returnType2,
                      ReturnTypes&&... returnTypes) noexcept ->
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type {
+        Kokkos::is_execution_policy<PolicyType>::value>::type {
   //----------------------------------------
   // Since we don't support asynchronous combined reducers yet for various
   // reasons, we actually just want to work with the pointers and references
@@ -581,8 +578,11 @@ auto parallel_reduce(std::string const& label, PolicyType const& policy,
 
   reduce_adaptor_t::execute(label, policy, combined_functor, combined_reducer);
   Impl::ParallelReduceFence<typename PolicyType::execution_space,
-                            combined_reducer_type>::fence(policy.space(),
-                                                          combined_reducer);
+                            combined_reducer_type>::
+      fence(
+          policy.space(),
+          "Kokkos::parallel_reduce: fence due to result being value, not view",
+          combined_reducer);
   combined_reducer.write_value_back_to_original_references(
       value, Impl::_make_reducer_from_arg<space_type>(returnType1),
       Impl::_make_reducer_from_arg<space_type>(returnType2),
@@ -596,7 +596,7 @@ auto parallel_reduce(PolicyType const& policy, Functor const& functor,
                      ReturnType1&& returnType1, ReturnType2&& returnType2,
                      ReturnTypes&&... returnTypes) noexcept ->
     typename std::enable_if<
-        Kokkos::Impl::is_execution_policy<PolicyType>::value>::type {
+        Kokkos::is_execution_policy<PolicyType>::value>::type {
   //----------------------------------------
   Kokkos::parallel_reduce("", policy, functor,
                           std::forward<ReturnType1>(returnType1),
diff --git a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
index c02f4acddacb41f5fb01c50536f6a426738fac99..dafe57f8da71cd22ea09a4e93a84f3196b24ca5c 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
@@ -138,15 +138,15 @@ struct concurrent_bitset {
     // when is full at the atomic_fetch_add(+1)
     // then a release occurs before the atomic_fetch_add(-1).
 
-    const uint32_t state =
-        (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1);
+    const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add(
+        reinterpret_cast<volatile int *>(buffer), 1);
 
     const uint32_t state_error = state_header != (state & state_header_mask);
 
     const uint32_t state_bit_used = state & state_used_mask;
 
     if (state_error || (bit_bound <= state_bit_used)) {
-      Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+      Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
       return state_error ? type(-2, -2) : type(-1, -1);
     }
 
@@ -222,15 +222,15 @@ struct concurrent_bitset {
     // when is full at the atomic_fetch_add(+1)
     // then a release occurs before the atomic_fetch_add(-1).
 
-    const uint32_t state =
-        (uint32_t)Kokkos::atomic_fetch_add((volatile int *)buffer, 1);
+    const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add(
+        reinterpret_cast<volatile int *>(buffer), 1);
 
     const uint32_t state_error = state_header != (state & state_header_mask);
 
     const uint32_t state_bit_used = state & state_used_mask;
 
     if (state_error || (bit_bound <= state_bit_used)) {
-      Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+      Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
       return state_error ? type(-2, -2) : type(-1, -1);
     }
 
@@ -300,7 +300,8 @@ struct concurrent_bitset {
     // Do not update count until bit clear is visible
     Kokkos::memory_fence();
 
-    const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+    const int count =
+        Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
 
     // Flush the store-release
     Kokkos::memory_fence();
@@ -336,7 +337,8 @@ struct concurrent_bitset {
     // Do not update count until bit clear is visible
     Kokkos::memory_fence();
 
-    const int count = Kokkos::atomic_fetch_add((volatile int *)buffer, -1);
+    const int count =
+        Kokkos::atomic_fetch_add(reinterpret_cast<volatile int *>(buffer), -1);
 
     return (count & state_used_mask) - 1;
   }
diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
index b4769fbeaa53be8353df315ede634708da1b297d..a1f9d336329fff0426863f28f493854e7f8091f3 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -130,6 +130,11 @@ void ExecSpaceManager::static_fence() {
     to_fence.second->fence();
   }
 }
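+// Named variant: fences every registered execution space instance, passing the
+// label through to each space's fence(name).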
+void ExecSpaceManager::static_fence(const std::string& name) {
+  for (auto& to_fence : exec_space_factory_list) {
+    to_fence.second->fence(name);
+  }
+}
 void ExecSpaceManager::print_configuration(std::ostream& msg,
                                            const bool detail) {
   for (auto& to_print : exec_space_factory_list) {
@@ -506,11 +511,6 @@ void pre_initialize_internal(const InitArguments& args) {
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes");
 #else
   declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no");
-#endif
-#ifdef KOKKOS_ENABLE_MPI
-  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "yes");
-#else
-  declare_configuration_metadata("options", "KOKKOS_ENABLE_MPI", "no");
 #endif
   declare_configuration_metadata("architecture", "Default Device",
                                  typeid(Kokkos::DefaultExecutionSpace).name());
@@ -564,7 +564,9 @@ void finalize_internal(const bool all_spaces = false) {
   g_tune_internals = false;
 }
 
-void fence_internal() { Impl::ExecSpaceManager::get_instance().static_fence(); }
+void fence_internal(const std::string& name) {
+  Impl::ExecSpaceManager::get_instance().static_fence(name);
+}
 
 bool check_arg(char const* arg, char const* expected) {
   std::size_t arg_len = std::strlen(arg);
@@ -1092,7 +1094,8 @@ void finalize_all() {
   Impl::finalize_internal(all_spaces);
 }
 
-void fence() { Impl::fence_internal(); }
+void fence() { Impl::fence_internal("Kokkos::fence: Unnamed Global Fence"); }
+void fence(const std::string& name) { Impl::fence_internal(name); }
 
 void print_helper(std::ostringstream& out,
                   const std::map<std::string, std::string>& print_me) {
diff --git a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
index a124511c07e2fcb50c8392e56f7c5393262a3af7..dc8e5e4d830623b4fa794966da4b724467e181dc 100644
--- a/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_EBO.hpp
@@ -79,20 +79,6 @@ struct EBOBaseImpl;
 
 template <class T, template <class...> class CtorNotOnDevice>
 struct EBOBaseImpl<T, true, CtorNotOnDevice> {
-  /*
-   * Workaround for constexpr in C++11: we need to still call T(args...), but we
-   * can't do so in the body of a constexpr function (in C++11), and there's no
-   * data member to construct into. But we can construct into an argument
-   * of a delegating constructor...
-   */
-  // TODO @minor DSH the destructor gets called too early with this workaround
-  struct _constexpr_14_workaround_tag {};
-  struct _constexpr_14_workaround_no_device_tag {};
-  KOKKOS_FORCEINLINE_FUNCTION
-  constexpr EBOBaseImpl(_constexpr_14_workaround_tag, T&&) noexcept {}
-  inline constexpr EBOBaseImpl(_constexpr_14_workaround_no_device_tag,
-                               T&&) noexcept {}
-
   template <
       class... Args, class _ignored = void,
       typename std::enable_if<std::is_void<_ignored>::value &&
@@ -100,10 +86,7 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
                                   !CtorNotOnDevice<Args...>::value,
                               int>::type = 0>
   KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl(
-      Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...)))
-      // still call the constructor
-      : EBOBaseImpl(_constexpr_14_workaround_tag{},
-                    T(std::forward<Args>(args)...)) {}
+      Args&&...) noexcept {}
 
   template <
       class... Args, class _ignored = void,
@@ -111,11 +94,7 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
                                   std::is_constructible<T, Args...>::value &&
                                   CtorNotOnDevice<Args...>::value,
                               long>::type = 0>
-  inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept(
-      noexcept(T(std::forward<Args>(args)...)))
-      // still call the constructor
-      : EBOBaseImpl(_constexpr_14_workaround_no_device_tag{},
-                    T(std::forward<Args>(args)...)) {}
+  inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {}
 
   KOKKOS_DEFAULTED_FUNCTION
   constexpr EBOBaseImpl(EBOBaseImpl const&) = default;
@@ -124,19 +103,16 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
   constexpr EBOBaseImpl(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
   ~EBOBaseImpl() = default;
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T& _ebo_data_member() & { return *reinterpret_cast<T*>(this); }
+  constexpr T& _ebo_data_member() & { return *reinterpret_cast<T*>(this); }
 
   KOKKOS_INLINE_FUNCTION
   constexpr T const& _ebo_data_member() const& {
@@ -154,8 +130,9 @@ struct EBOBaseImpl<T, true, CtorNotOnDevice> {
   }
 
   KOKKOS_INLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T&& _ebo_data_member() && { return std::move(*reinterpret_cast<T*>(this)); }
+  constexpr T&& _ebo_data_member() && {
+    return std::move(*reinterpret_cast<T*>(this));
+  }
 };
 
 template <class T, template <class...> class CTorsNotOnDevice>
@@ -191,12 +168,10 @@ struct EBOBaseImpl<T, false, CTorsNotOnDevice> {
   constexpr EBOBaseImpl(EBOBaseImpl&&) noexcept = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
+  constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default;
 
   KOKKOS_DEFAULTED_FUNCTION
   ~EBOBaseImpl() = default;
@@ -232,8 +207,7 @@ struct StandardLayoutNoUniqueAddressMemberEmulation
   using ebo_base_t::ebo_base_t;
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T& no_unique_address_data_member() & {
+  constexpr T& no_unique_address_data_member() & {
     return this->ebo_base_t::_ebo_data_member();
   }
 
@@ -253,8 +227,7 @@ struct StandardLayoutNoUniqueAddressMemberEmulation
   }
 
   KOKKOS_FORCEINLINE_FUNCTION
-  KOKKOS_CONSTEXPR_14
-  T&& no_unique_address_data_member() && {
+  constexpr T&& no_unique_address_data_member() && {
     return this->ebo_base_t::_ebo_data_member();
   }
 };
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
index dfb9f3a51cdbd9aa7e189e21f5956806d53823b5..9c8024cbd03ee9230b1ed27468c7cb82aadc5d97 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -138,6 +138,9 @@ void Experimental::RawMemoryAllocationFailure::print_error_message(
     case AllocationMechanism::SYCLMallocShared:
       o << "sycl::malloc_shared().";
       break;
+    case AllocationMechanism::SYCLMallocHost:
+      o << "sycl::malloc_host().";
+      break;
   }
   append_additional_error_information(o);
   o << ")" << std::endl;
diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
index 5db459734631ddff5d0a29963a9ec04b9ec549ea..dc9bfe2b5a9e0eb66dc2a6ae43fd296726e7a458 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Error.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -97,7 +97,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc {
     HIPMalloc,
     HIPHostMalloc,
     SYCLMallocDevice,
-    SYCLMallocShared
+    SYCLMallocShared,
+    SYCLMallocHost
   };
 
  private:
@@ -218,31 +219,41 @@ KOKKOS_IMPL_ABORT_NORETURN KOKKOS_INLINE_FUNCTION void abort(
 
 #if !defined(NDEBUG) || defined(KOKKOS_ENFORCE_CONTRACTS) || \
     defined(KOKKOS_ENABLE_DEBUG)
-#define KOKKOS_EXPECTS(...)                                               \
-  {                                                                       \
-    if (!bool(__VA_ARGS__)) {                                             \
-      ::Kokkos::abort(                                                    \
-          "Kokkos contract violation:\n  "                                \
-          "  Expected precondition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                     \
+#define KOKKOS_EXPECTS(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Expected precondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
-#define KOKKOS_ENSURES(...)                                               \
-  {                                                                       \
-    if (!bool(__VA_ARGS__)) {                                             \
-      ::Kokkos::abort(                                                    \
-          "Kokkos contract violation:\n  "                                \
-          "  Ensured postcondition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                     \
+#define KOKKOS_ENSURES(...)                                                    \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Ensured postcondition `" #__VA_ARGS__                             \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
-// some projects already define this for themselves, so don't mess them up
+// some projects already define this for themselves, so don't mess
+// them up
 #ifndef KOKKOS_ASSERT
-#define KOKKOS_ASSERT(...)                                             \
-  {                                                                    \
-    if (!bool(__VA_ARGS__)) {                                          \
-      ::Kokkos::abort(                                                 \
-          "Kokkos contract violation:\n  "                             \
-          "  Asserted condition `" #__VA_ARGS__ "` evaluated false."); \
-    }                                                                  \
+#define KOKKOS_ASSERT(...)                                                     \
+  {                                                                            \
+    if (!bool(__VA_ARGS__)) {                                                  \
+      ::Kokkos::abort(                                                         \
+          "Kokkos contract violation:\n  "                                     \
+          "  Asserted condition `" #__VA_ARGS__                                \
+          "` evaluated false.\n"                                               \
+          "Error at " KOKKOS_IMPL_TOSTRING(__FILE__) ":" KOKKOS_IMPL_TOSTRING( \
+              __LINE__) " \n");                                                \
+    }                                                                          \
   }
 #endif  // ifndef KOKKOS_ASSERT
 #else   // not debug mode
diff --git a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
index a922e7e3f9b19d0413487674f847b539e9d4f10a..1a0b10e40fe5e280746c3c0443202a4413585a0c 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ExecSpaceInitializer.hpp
@@ -55,6 +55,7 @@ class ExecSpaceInitializerBase {
   virtual void initialize(const InitArguments &args)                     = 0;
   virtual void finalize(const bool all_spaces)                           = 0;
   virtual void fence()                                                   = 0;
+  virtual void fence(const std::string &)                                = 0;
   virtual void print_configuration(std::ostream &msg, const bool detail) = 0;
   ExecSpaceInitializerBase()          = default;
   virtual ~ExecSpaceInitializerBase() = default;
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
index 22e88ebc4fc57d4e7132bca0be2aa55f5bfc5f69..5de92fc45741234aafaa97fb0c31dc11aa9d9c10 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -48,7 +48,6 @@
 #include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -1335,7 +1334,10 @@ struct FunctorValueTraits<FunctorType, ArgTag,
   using functor_type = FunctorType;
 
   static_assert(
-      IS_VOID || IS_REJECT || 0 == (sizeof(ValueType) % sizeof(int)),
+      IS_VOID || IS_REJECT ||
+          ((sizeof(ValueType) > sizeof(int))
+               ? 0 == sizeof(ValueType) % sizeof(int)
+               : true),
       "Reduction functor's value_type deduced from functor::operator() "
       "requires: 0 == sizeof(value_type) % sizeof(int)");
 
@@ -1902,17 +1904,6 @@ struct FunctorFinalFunction {
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type&));
 
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type volatile & ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & )
-  // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type volatile & ) ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (             *)( ArgTag         , value_type volatile & )
-  // ); KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)(
-  // ArgTag const & , value_type volatile & ) );
-
   KOKKOS_INLINE_FUNCTION static void enable_if(
       void (FunctorType::*)(ArgTag, value_type const&) const);
   KOKKOS_INLINE_FUNCTION static void enable_if(
@@ -1925,17 +1916,6 @@ struct FunctorFinalFunction {
                                                         value_type const&));
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type const&));
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type const volatile & ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type const
-  // volatile & ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type const volatile & ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type const volatile & ) ); KOKKOS_INLINE_FUNCTION static
-  // void enable_if( void (             *)( ArgTag         , value_type const
-  // volatile & ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)(
-  // ArgTag const & , value_type const volatile & ) );
 };
 
 // Compatible functions for 'final' function and value_type is an array
@@ -1956,17 +1936,6 @@ struct FunctorFinalFunction<FunctorType, ArgTag, true> {
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type*));
 
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type volatile * ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * )
-  // const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type volatile * ) ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (             *)( ArgTag         , value_type volatile * )
-  // ); KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)(
-  // ArgTag const & , value_type volatile * ) );
-
   KOKKOS_INLINE_FUNCTION static void enable_if(
       void (FunctorType::*)(ArgTag, value_type const*) const);
   KOKKOS_INLINE_FUNCTION static void enable_if(
@@ -1979,17 +1948,6 @@ struct FunctorFinalFunction<FunctorType, ArgTag, true> {
                                                         value_type const*));
   KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
                                                         value_type const*));
-
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // , value_type const volatile * ) const ); KOKKOS_INLINE_FUNCTION static void
-  // enable_if( void (FunctorType::*)( ArgTag const & , value_type const
-  // volatile * ) const ); KOKKOS_INLINE_FUNCTION static void enable_if( void
-  // (FunctorType::*)( ArgTag         , value_type const volatile * ) );
-  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag
-  // const & , value_type const volatile * ) ); KOKKOS_INLINE_FUNCTION static
-  // void enable_if( void (             *)( ArgTag         , value_type const
-  // volatile * ) ); KOKKOS_INLINE_FUNCTION static void enable_if( void ( *)(
-  // ArgTag const & , value_type const volatile * ) );
 };
 
 template <class FunctorType>
@@ -2109,89 +2067,4 @@ struct FunctorFinal<
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-namespace Impl {
-
-template <class FunctorType, class ArgTag,
-          class ReferenceType =
-              typename FunctorValueTraits<FunctorType, ArgTag>::reference_type>
-struct FunctorApplyFunction {
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ArgTag const&, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag, ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ArgTag const&,
-                                                        ReferenceType));
-};
-
-template <class FunctorType, class ReferenceType>
-struct FunctorApplyFunction<FunctorType, void, ReferenceType> {
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ReferenceType) const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(
-      void (FunctorType::*)(ReferenceType));
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (*)(ReferenceType));
-};
-
-template <class FunctorType>
-struct FunctorApplyFunction<FunctorType, void, void> {
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)() const);
-  KOKKOS_INLINE_FUNCTION static void enable_if(void (FunctorType::*)());
-};
-
-template <class FunctorType, class ArgTag, class ReferenceType,
-          class Enable = void>
-struct FunctorApply {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType&, void*) {}
-};
-
-/* 'apply' function provided for void value */
-template <class FunctorType, class ArgTag>
-struct FunctorApply<
-    FunctorType, ArgTag,
-    void
-    // First  substitution failure when FunctorType::apply does not exist.
-    // Second substitution failure when enable_if( & Functor::apply ) does not
-    // exist
-    ,
-    decltype(FunctorApplyFunction<FunctorType, ArgTag, void>::enable_if(
-        &FunctorType::apply))> {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f) { f.apply(); }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f) {
-    f.apply();
-  }
-};
-
-/* 'apply' function provided for single value */
-template <class FunctorType, class ArgTag, class T>
-struct FunctorApply<FunctorType, ArgTag,
-                    T&
-                    // First  substitution failure when FunctorType::apply does
-                    // not exist. Second substitution failure when enable_if( &
-                    // Functor::apply ) does not exist
-                    ,
-                    decltype(
-                        FunctorApplyFunction<FunctorType, ArgTag>::enable_if(
-                            &FunctorType::apply))> {
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(const FunctorType& f, void* p) {
-    f.apply(*((T*)p));
-  }
-
-  KOKKOS_FORCEINLINE_FUNCTION static void apply(FunctorType& f, void* p) {
-    f.apply(*((T*)p));
-  }
-};
-
-}  // namespace Impl
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* KOKKOS_FUNCTORADAPTER_HPP */
diff --git a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
index a56d19ee722668389b4b43bc377d79bb7fd9799b..7140154e0f6f276dc928dc5f3a73cda97f6e2cec 100644
--- a/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -48,7 +48,6 @@
 #include <cstddef>
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -722,14 +721,16 @@ struct FunctorAnalysis {
 
     template <bool IsArray>
     KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<IsArray, FunctorAnalysis::ValueType*>::type
+        typename std::enable_if<IsArray,
+                                typename FunctorAnalysis::ValueType*>::type
         ref() const noexcept {
       return m_result;
     }
 
     template <bool IsArray>
     KOKKOS_INLINE_FUNCTION constexpr
-        typename std::enable_if<!IsArray, FunctorAnalysis::ValueType&>::type
+        typename std::enable_if<!IsArray,
+                                typename FunctorAnalysis::ValueType&>::type
         ref() const noexcept {
       return *m_result;
     }
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
index 97286dd07f4ea2ee94f3070768f425e2ef5b7896..3b7b194db58cb693f69d8a6560896565062b9d99 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostSharedPtr.hpp
@@ -47,6 +47,7 @@
 
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
 
 #include <functional>
 
@@ -92,6 +93,8 @@ class HostSharedPtr {
     // FIXME_OPENMPTARGET requires something like KOKKOS_IMPL_IF_ON_HOST
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
     if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#else
+    m_control = nullptr;
 #endif
   }
 
@@ -115,6 +118,8 @@ class HostSharedPtr {
       // FIXME_OPENMPTARGET
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
       if (m_control) Kokkos::atomic_add(&(m_control->m_counter), 1);
+#else
+      m_control = nullptr;
 #endif
     }
     return *this;
@@ -154,6 +159,9 @@ class HostSharedPtr {
     // object pointed to by m_counter and m_element_ptr.
     if (m_control) {
       int const count = Kokkos::atomic_fetch_sub(&(m_control->m_counter), 1);
+      // atomic_fetch_sub might use relaxed memory ordering, so we need to
+      // force synchronization to avoid multiple threads doing the cleanup.
+      Kokkos::memory_fence();
       if (count == 1) {
         (m_control->m_deleter)(m_element_ptr);
         m_element_ptr = nullptr;
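
The fence added in the hunk above guards the classic last-owner-runs-the-cleanup pattern. A minimal sketch of the same idea in standard atomics (hypothetical `Control`/`release` names, not the Kokkos implementation; the patch inserts an unconditional `Kokkos::memory_fence()` instead because, per the comment above, the ordering of `Kokkos::atomic_fetch_sub` is not specified at the call site):

    #include <atomic>

    // Hypothetical control block mirroring HostSharedPtr's cleanup path.
    struct Control {
      std::atomic<int> counter{1};
    };

    template <class T, class Deleter>
    void release(Control* c, T* element, Deleter del) {
      if (c == nullptr) return;
      // Release ordering publishes all prior writes to *element before the
      // count drops; the thread that sees the old value 1 is the last owner.
      if (c->counter.fetch_sub(1, std::memory_order_release) == 1) {
        // Acquire fence pairs with the other owners' release decrements, so
        // the cleanup below is ordered after every use of the object.
        std::atomic_thread_fence(std::memory_order_acquire);
        del(element);
        delete c;
      }
    }
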
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
index 2e5587e4a342c8c2b167f307f8c8b3a3215f304a..a7f4a652befb148bafff866d8949edb9f2520eaa 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -74,8 +74,8 @@ void HostThreadTeamData::organize_pool(HostThreadTeamData *members[],
     }
 
     {
-      HostThreadTeamData **const pool =
-          (HostThreadTeamData **)(root_scratch + m_pool_members);
+      HostThreadTeamData **const pool = reinterpret_cast<HostThreadTeamData **>(
+          root_scratch + m_pool_members);
 
       // team size == 1, league size == pool_size
 
@@ -136,7 +136,8 @@ int HostThreadTeamData::organize_team(const int team_size) {
     if (team_size == 1) return 1;  // Already organized in teams of one
 
     HostThreadTeamData *const *const pool =
-        (HostThreadTeamData **)(m_pool_scratch + m_pool_members);
+        reinterpret_cast<HostThreadTeamData **>(m_pool_scratch +
+                                                m_pool_members);
 
     // "league_size" in this context is the number of concurrent teams
     // that the pool can accommodate.  Excess threads are idle.
@@ -239,7 +240,8 @@ int HostThreadTeamData::get_work_stealing() noexcept {
 
     if (w.first == -1 && m_steal_rank != m_pool_rank) {
       HostThreadTeamData *const *const pool =
-          (HostThreadTeamData **)(m_pool_scratch + m_pool_members);
+          reinterpret_cast<HostThreadTeamData **>(m_pool_scratch +
+                                                  m_pool_members);
 
       // Attempt from beginning failed, try to steal from end of neighbor
 
@@ -287,23 +289,17 @@ int HostThreadTeamData::get_work_stealing() noexcept {
 
     if (1 < m_team_size) {
       // Must share the work index
-      *((int volatile *)team_reduce()) = w.first;
+      *reinterpret_cast<int volatile *>(team_reduce()) = w.first;
 
       team_rendezvous_release();
     }
   } else if (1 < m_team_size) {
-    w.first = *((int volatile *)team_reduce());
+    w.first = *reinterpret_cast<int volatile *>(team_reduce());
   }
 
   // May exit because successfully stole work and w is good.
   // May exit because no work left to steal and w = (-1,-1).
 
-#if 0
-fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
-       , m_pool_rank , m_pool_size , w.first );
-fflush(stdout);
-#endif
-
   return w.first;
 }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
index d4cae7f122ed182cf88522d5d60729a0906cce5b..0652b55bb71cfb3923374e774bbb2db2f58ee90d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -91,9 +91,18 @@ class HostThreadTeamData {
   //   [ thread_local ]     = [ m_thread_local    .. m_scratch_size )
 
   enum : int { m_pool_members = 0 };
-  enum : int { m_pool_rendezvous = m_pool_members + max_pool_members };
-  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
-  enum : int { m_pool_reduce = m_team_rendezvous + max_team_rendezvous };
+  enum : int {
+    m_pool_rendezvous =
+        static_cast<int>(m_pool_members) + static_cast<int>(max_pool_members)
+  };
+  enum : int {
+    m_team_rendezvous = static_cast<int>(m_pool_rendezvous) +
+                        static_cast<int>(max_pool_rendezvous)
+  };
+  enum : int {
+    m_pool_reduce = static_cast<int>(m_team_rendezvous) +
+                    static_cast<int>(max_team_rendezvous)
+  };
 
   using pair_int_t = Kokkos::pair<int64_t, int64_t>;
 
@@ -120,13 +129,13 @@ class HostThreadTeamData {
   int mutable m_team_rendezvous_step;
 
   HostThreadTeamData* team_member(int r) const noexcept {
-    return ((HostThreadTeamData**)(m_pool_scratch +
-                                   m_pool_members))[m_team_base + r];
+    return (reinterpret_cast<HostThreadTeamData**>(
+        m_pool_scratch + m_pool_members))[m_team_base + r];
   }
 
  public:
   inline bool team_rendezvous() const noexcept {
-    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous);
     HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
     if (m_team_rank != 0) {
       HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
@@ -138,7 +147,7 @@ class HostThreadTeamData {
   }
 
   inline bool team_rendezvous(const int source_team_rank) const noexcept {
-    int* ptr = (int*)(m_team_scratch + m_team_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous);
     HostBarrier::split_arrive(ptr, m_team_size, m_team_rendezvous_step);
     if (m_team_rank != source_team_rank) {
       HostBarrier::wait(ptr, m_team_size, m_team_rendezvous_step);
@@ -150,12 +159,13 @@ class HostThreadTeamData {
   }
 
   inline void team_rendezvous_release() const noexcept {
-    HostBarrier::split_release((int*)(m_team_scratch + m_team_rendezvous),
-                               m_team_size, m_team_rendezvous_step);
+    HostBarrier::split_release(
+        reinterpret_cast<int*>(m_team_scratch + m_team_rendezvous), m_team_size,
+        m_team_rendezvous_step);
   }
 
   inline int pool_rendezvous() const noexcept {
-    int* ptr = (int*)(m_pool_scratch + m_pool_rendezvous);
+    int* ptr = reinterpret_cast<int*>(m_pool_scratch + m_pool_rendezvous);
     HostBarrier::split_arrive(ptr, m_pool_size, m_pool_rendezvous_step);
     if (m_pool_rank != 0) {
       HostBarrier::wait(ptr, m_pool_size, m_pool_rendezvous_step);
@@ -167,8 +177,9 @@ class HostThreadTeamData {
   }
 
   inline void pool_rendezvous_release() const noexcept {
-    HostBarrier::split_release((int*)(m_pool_scratch + m_pool_rendezvous),
-                               m_pool_size, m_pool_rendezvous_step);
+    HostBarrier::split_release(
+        reinterpret_cast<int*>(m_pool_scratch + m_pool_rendezvous), m_pool_size,
+        m_pool_rendezvous_step);
   }
 
   //----------------------------------------
@@ -230,7 +241,8 @@ class HostThreadTeamData {
   constexpr int pool_size() const { return m_pool_size; }
 
   HostThreadTeamData* pool_member(int r) const noexcept {
-    return ((HostThreadTeamData**)(m_pool_scratch + m_pool_members))[r];
+    return (reinterpret_cast<HostThreadTeamData**>(m_pool_scratch +
+                                                   m_pool_members))[r];
   }
 
   //----------------------------------------
@@ -330,24 +342,11 @@ class HostThreadTeamData {
     team_shared_size = align_to_int64(team_shared_size);
     // thread_local_size = align_to_int64( thread_local_size );
 
-    m_scratch      = (int64_t*)alloc_ptr;
+    m_scratch      = static_cast<int64_t*>(alloc_ptr);
     m_team_reduce  = m_pool_reduce + pool_reduce_size;
     m_team_shared  = m_team_reduce + team_reduce_size;
     m_thread_local = m_team_shared + team_shared_size;
     m_scratch_size = align_to_int64(alloc_size);
-
-#if 0
-fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
-       , int(m_pool_members)
-       , int(m_pool_rendezvous)
-       , int(m_pool_reduce)
-       , int(m_team_reduce)
-       , int(m_team_shared)
-       , int(m_thread_local)
-       , int(m_scratch_size)
-       );
-fflush(stdout);
-#endif
   }
 
   //----------------------------------------
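
One likely reason for the `static_cast<int>` additions in the enum offsets above (an assumption on my part, not stated in the patch) is that C++20 deprecates arithmetic that mixes enumerators of two different enumeration types. A tiny illustration with made-up enumerators:

    enum : int { pool_members_demo = 0 };
    enum : int { max_pool_members_demo = 16 };

    // Deprecated in C++20 (-Wdeprecated-enum-enum-conversion): the operands
    // are enumerators of two different unnamed enumeration types.
    enum : int { rendezvous_mixed = pool_members_demo + max_pool_members_demo };

    // Casting to int first keeps the arithmetic in a plain integer type.
    enum : int {
      rendezvous_cast = static_cast<int>(pool_members_demo) +
                        static_cast<int>(max_pool_members_demo)
    };
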
diff --git a/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
index 79aeca5da0691264c4cb215f62e17bdb8dbe95e1..1ed502db5be61a2501b57f08969ff47ce12bdbe5 100644
--- a/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_LinkedListNode.hpp
@@ -110,7 +110,7 @@ struct SimpleSinglyLinkedListNode {
   friend struct LinkedListNodeAccess;
 
  public:
-  // KOKKOS_CONSTEXPR_14
+  // constexpr
   KOKKOS_INLINE_FUNCTION
   bool is_enqueued() const noexcept {
     // TODO @tasking @memory_order DSH make this an atomic load with memory
@@ -118,7 +118,7 @@ struct SimpleSinglyLinkedListNode {
     return m_next != reinterpret_cast<pointer_type>(NotEnqueuedValue);
   }
 
-  // KOKKOS_CONSTEXPR_14
+  // constexpr
   KOKKOS_INLINE_FUNCTION
   bool is_enqueued() const volatile noexcept {
     // TODO @tasking @memory_order DSH make this an atomic load with memory
diff --git a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index 76d553601923fd7282132fbff05ce69a4e576e97..865d1c47faacfe3d6b39d4227bb180bf483c89dd 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -48,7 +48,7 @@
 namespace Kokkos {
 
 //----------------------------------------------------------------------------
-
+#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
 KOKKOS_FORCEINLINE_FUNCTION
 void memory_fence() {
 #if defined(__CUDA_ARCH__)
@@ -75,6 +75,7 @@ void memory_fence() {
 #error "Error: memory_fence() not defined"
 #endif
 }
+#endif
 
 //////////////////////////////////////////////////////
 // store_fence()
diff --git a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
index fe78cfbacc632d353844a5cb17f89a5d5ba067ff..1c61b73f027aaefa4993aaae7beee3ca9af05110 100644
--- a/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_MultipleTaskQueue.hpp
@@ -58,8 +58,7 @@
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
 #include <impl/Kokkos_TaskQueueCommon.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -467,7 +466,7 @@ class MultipleTaskQueue final
 
   // TODO @tasking @generalization DSH make this a property-based customization
   // point
-  static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size(
+  static /* constexpr */ size_t task_queue_allocation_size(
       typename base_t::execution_space const& exec_space,
       typename base_t::memory_space const&,
       typename base_t::memory_pool const&) {
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
index 94ea6e1a2b10c33a81e4f2c6b7a932577ce6144b..8505e8f51aec744c00dfe2fef3f29d1eb9cb7306 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.cpp
@@ -53,6 +53,7 @@
 #include <array>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <stack>
 #include <unordered_map>
 #include <unordered_set>
@@ -70,7 +71,9 @@ void tool_invoked_fence(const uint32_t /* devID */) {
    * Eventually we want to support fencing only
    * a given stream/resource
    */
-  Kokkos::fence();
+  Kokkos::fence(
+      "Kokkos::Tools::Experimental::Impl::tool_invoked_fence: Tool Requested "
+      "Fence");
 }
 }  // namespace Impl
 #ifdef KOKKOS_ENABLE_TUNING
@@ -131,7 +134,8 @@ inline void invoke_kokkosp_callback(
     if (may_require_global_fencing == MayRequireGlobalFencing::Yes &&
         (Kokkos::Tools::Experimental::tool_requirements
              .requires_global_fencing)) {
-      Kokkos::fence();
+      Kokkos::fence(
+          "Kokkos::Tools::invoke_kokkosp_callback: Kokkos Profile Tool Fence");
     }
     (*callback)(std::forward<Args>(args)...);
   }
@@ -432,18 +436,43 @@ void initialize(const std::string& profileLibrary) {
   if (is_initialized) return;
   is_initialized = 1;
 
+  auto invoke_init_callbacks = []() {
+    Experimental::invoke_kokkosp_callback(
+        Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
+        Kokkos::Tools::Experimental::current_callbacks.init, 0,
+        (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
+
+    Experimental::tool_requirements.requires_global_fencing = true;
+
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.request_tool_settings, 1,
+        &Experimental::tool_requirements);
+
+    Experimental::ToolProgrammingInterface actions;
+    actions.fence = &Experimental::Impl::tool_invoked_fence;
+
+    Experimental::invoke_kokkosp_callback(
+        Experimental::MayRequireGlobalFencing::No,
+        Experimental::current_callbacks.provide_tool_programming_interface, 1,
+        actions);
+  };
+
 #ifdef KOKKOS_ENABLE_LIBDL
   void* firstProfileLibrary = nullptr;
 
-  if (profileLibrary.empty()) return;
+  if (profileLibrary.empty()) {
+    invoke_init_callbacks();
+    return;
+  }
 
   char* envProfileLibrary = const_cast<char*>(profileLibrary.c_str());
 
-  char* envProfileCopy =
-      (char*)malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
-  sprintf(envProfileCopy, "%s", envProfileLibrary);
+  const auto envProfileCopy =
+      std::make_unique<char[]>(strlen(envProfileLibrary) + 1);
+  sprintf(envProfileCopy.get(), "%s", envProfileLibrary);
 
-  char* profileLibraryName = strtok(envProfileCopy, ";");
+  char* profileLibraryName = strtok(envProfileCopy.get(), ";");
 
   if ((profileLibraryName != nullptr) &&
       (strcmp(profileLibraryName, "") != 0)) {
@@ -574,25 +603,8 @@ void initialize(const std::string& profileLibrary) {
 #else
   (void)profileLibrary;
 #endif  // KOKKOS_ENABLE_LIBDL
-  Experimental::invoke_kokkosp_callback(
-      Kokkos::Tools::Experimental::MayRequireGlobalFencing::No,
-      Kokkos::Tools::Experimental::current_callbacks.init, 0,
-      (uint64_t)KOKKOSP_INTERFACE_VERSION, (uint32_t)0, nullptr);
-
-  Experimental::tool_requirements.requires_global_fencing = true;
-
-  Experimental::invoke_kokkosp_callback(
-      Experimental::MayRequireGlobalFencing::No,
-      Experimental::current_callbacks.request_tool_settings, 1,
-      &Experimental::tool_requirements);
 
-  Experimental::ToolProgrammingInterface actions;
-  actions.fence = &Experimental::Impl::tool_invoked_fence;
-
-  Experimental::invoke_kokkosp_callback(
-      Experimental::MayRequireGlobalFencing::No,
-      Experimental::current_callbacks.provide_tool_programming_interface, 1,
-      actions);
+  invoke_init_callbacks();
 
 #ifdef KOKKOS_ENABLE_TUNING
   Experimental::VariableInfo kernel_name;
@@ -656,9 +668,6 @@ void initialize(const std::string& profileLibrary) {
   Experimental::no_profiling.declare_output_type   = nullptr;
   Experimental::no_profiling.request_output_values = nullptr;
   Experimental::no_profiling.end_tuning_context    = nullptr;
-#ifdef KOKKOS_ENABLE_LIBDL
-  free(envProfileCopy);
-#endif
 }
 
 void finalize() {
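
The `malloc`/`free` replacement in the hunks above boils down to copying a `std::string` into a writable, automatically freed buffer that `strtok` may modify in place. A minimal sketch of that pattern (the helper name is hypothetical; the patch itself uses `sprintf` for the copy):

    #include <cstring>
    #include <memory>
    #include <string>

    // Copy 's' (including the terminating '\0') into a heap buffer owned by a
    // unique_ptr, so strtok() can tokenize it without touching the original.
    inline std::unique_ptr<char[]> writable_copy(const std::string& s) {
      auto buf = std::make_unique<char[]>(s.size() + 1);
      std::memcpy(buf.get(), s.c_str(), s.size() + 1);
      return buf;
    }
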
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
index 1ff6a36c3bc3c934e787af30c5bd6568046f15f1..86a4cfa4a8543e58feab3aee24f1a9ac8530bb5e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling.hpp
@@ -50,9 +50,12 @@
 #include <Kokkos_Macros.hpp>
 #include <Kokkos_Tuners.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
+#include <memory>
+#include <unordered_map>
 #include <map>
 #include <string>
 #include <type_traits>
+#include <mutex>
 namespace Kokkos {
 
 // forward declaration
@@ -135,6 +138,71 @@ Kokkos_Profiling_SpaceHandle make_space_handle(const char* space_name);
 
 namespace Experimental {
 
+namespace Impl {
+struct DirectFenceIDHandle {
+  uint32_t value;
+};
+//
+template <typename Space>
+uint32_t idForInstance(const uintptr_t instance) {
+  static std::mutex instance_mutex;
+  const std::lock_guard<std::mutex> lock(instance_mutex);
+  /** Needs to be a pointer due to initialization order problems */
+  using map_type = std::map<uintptr_t, uint32_t>;
+
+  static std::shared_ptr<map_type> map;
+  if (map.get() == nullptr) {
+    map = std::make_shared<map_type>(map_type());
+  }
+
+  static uint32_t value = 0;
+  constexpr const uint32_t offset =
+      Kokkos::Tools::Experimental::NumReservedDeviceIDs;
+
+  auto find = map->find(instance);
+  if (find == map->end()) {
+    auto ret         = offset + value++;
+    (*map)[instance] = ret;
+    return ret;
+  }
+
+  return find->second;
+}
+
+template <typename Space, typename FencingFunctor>
+void profile_fence_event(const std::string& name, DirectFenceIDHandle devIDTag,
+                         const FencingFunctor& func) {
+  uint64_t handle = 0;
+  Kokkos::Tools::beginFence(
+      name,
+      Kokkos::Tools::Experimental::device_id_root<Space>() + devIDTag.value,
+      &handle);
+  func();
+  Kokkos::Tools::endFence(handle);
+}
+
+inline uint32_t int_for_synchronization_reason(
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason) {
+  switch (reason) {
+    case GlobalDeviceSynchronization: return 0;
+    case DeepCopyResourceSynchronization: return 0x00ffffff;
+  }
+  return 0;
+}
+
+template <typename Space, typename FencingFunctor>
+void profile_fence_event(
+    const std::string& name,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const FencingFunctor& func) {
+  uint64_t handle = 0;
+  Kokkos::Tools::beginFence(
+      name, device_id_root<Space>() + int_for_synchronization_reason(reason),
+      &handle);  // TODO: correct ID
+  func();
+  Kokkos::Tools::endFence(handle);
+}
+}  // namespace Impl
 void set_init_callback(initFunction callback);
 void set_finalize_callback(finalizeFunction callback);
 void set_parse_args_callback(parseArgsFunction callback);
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
index ed8751c50cc04d915b7b3c371a6ec05756ff6087..2c8d1428fc595e0e6724624465ab839f5c80b138 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h
@@ -54,7 +54,7 @@
 #include <stdbool.h>
 #endif
 
-#define KOKKOSP_INTERFACE_VERSION 20210225
+#define KOKKOSP_INTERFACE_VERSION 20210623
 
 // Profiling
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index 7809632f78ddf33d8429b353723736b68e3b7536..a7aec2e6fd53f6a6b37b011452b58750b092f096 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -56,6 +56,14 @@
 namespace Kokkos {
 namespace Tools {
 namespace Experimental {
+
+constexpr const uint32_t NumReservedDeviceIDs = 1;
+
+enum SpecialSynchronizationCases : int {
+  GlobalDeviceSynchronization     = 1,
+  DeepCopyResourceSynchronization = 2,
+};
+
 enum struct DeviceType {
   Serial,
   OpenMP,
@@ -68,15 +76,49 @@ enum struct DeviceType {
   Unknown
 };
 
+struct ExecutionSpaceIdentifier {
+  DeviceType type;
+  uint32_t device_id;
+  uint32_t instance_id;
+};
+inline DeviceType devicetype_from_uint32t(const uint32_t in) {
+  switch (in) {
+    case 0: return DeviceType::Serial;
+    case 1: return DeviceType::OpenMP;
+    case 2: return DeviceType::Cuda;
+    case 3: return DeviceType::HIP;
+    case 4: return DeviceType::OpenMPTarget;
+    case 5: return DeviceType::HPX;
+    case 6: return DeviceType::Threads;
+    case 7: return DeviceType::SYCL;
+    default: return DeviceType::Unknown;  // TODO: error out?
+  }
+}
+
+inline ExecutionSpaceIdentifier identifier_from_devid(const uint32_t in) {
+  // ExecutionSpaceIdentifier out;
+  // out.type = in >> 24;
+  // out.device_id = in >> 17;
+  // out.instance_id = ((uint32_t(-1)) << 17 ) & in;
+  return {devicetype_from_uint32t(in >> 24),
+          (~((uint32_t(-1)) << 24)) & (in >> 17),
+          (~((uint32_t(-1)) << 17)) & in};
+}
+
 template <typename ExecutionSpace>
 struct DeviceTypeTraits;
 
 constexpr const size_t device_type_bits = 8;
 constexpr const size_t instance_bits    = 24;
 template <typename ExecutionSpace>
+constexpr uint32_t device_id_root() {
+  constexpr auto device_id =
+      static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id);
+  return (device_id << instance_bits);
+}
+template <typename ExecutionSpace>
 inline uint32_t device_id(ExecutionSpace const& space) noexcept {
-  auto device_id = static_cast<uint32_t>(DeviceTypeTraits<ExecutionSpace>::id);
-  return (device_id << instance_bits) + space.impl_instance_id();
+  return device_id_root<ExecutionSpace>() + space.impl_instance_id();
 }
 }  // namespace Experimental
 }  // namespace Tools
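
A standalone mirror of the device-id packing used by `device_id_root()`/`device_id()` above, with the 8/24 bit split given by `device_type_bits`/`instance_bits` (helper names are invented for illustration):

    #include <cstdint>

    // Top 8 bits hold the device type, low 24 bits hold the instance id.
    constexpr std::uint32_t instance_bits_demo = 24;

    constexpr std::uint32_t encode(std::uint32_t device_type,
                                   std::uint32_t instance) {
      return (device_type << instance_bits_demo) + instance;
    }
    constexpr std::uint32_t type_of(std::uint32_t devid) {
      return devid >> instance_bits_demo;
    }
    constexpr std::uint32_t instance_of(std::uint32_t devid) {
      return devid & ((std::uint32_t{1} << instance_bits_demo) - 1);
    }

    static_assert(type_of(encode(2, 5)) == 2, "top byte is the device type");
    static_assert(instance_of(encode(2, 5)) == 5, "low 24 bits are the instance");
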
diff --git a/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp b/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b67cede45bfddd657448a054aca3254d627267ce
--- /dev/null
+++ b/packages/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp
@@ -0,0 +1,187 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QUAD_PRECISION_MATH_HPP
+#define KOKKOS_QUAD_PRECISION_MATH_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#if defined(KOKKOS_ENABLE_LIBQUADMATH)
+
+#include <Kokkos_NumericTraits.hpp>
+
+#include <quadmath.h>
+
+#if !(defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+#error __float128 not supported on this host
+#endif
+
+//<editor-fold desc="numeric traits __float128 specializations">
+namespace Kokkos {
+namespace Experimental {
+#if defined(KOKKOS_ENABLE_CXX17)
+#define KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(TRAIT, TYPE, VALUE_TYPE, VALUE) \
+  template <>                                                                \
+  struct TRAIT<TYPE> {                                                       \
+    static constexpr VALUE_TYPE value = VALUE;                               \
+  };                                                                         \
+  template <>                                                                \
+  inline constexpr auto TRAIT##_v<TYPE> = TRAIT<TYPE>::value;
+#else
+#define KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(TRAIT, TYPE, VALUE_TYPE, VALUE) \
+  template <>                                                                \
+  struct TRAIT<TYPE> {                                                       \
+    static constexpr VALUE_TYPE value = VALUE;                               \
+  };
+#endif
+
+// clang-format off
+// Numeric distinguished value traits
+// Workaround GCC bug https://godbolt.org/z/qWb5oe4dx
+// error: '__builtin_huge_valq()' is not a constant expression
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 710)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(infinity,       __float128, __float128, HUGE_VALQ)
+#endif
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_min,     __float128, __float128, -FLT128_MAX)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_max,     __float128, __float128, FLT128_MAX)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(epsilon,        __float128, __float128, FLT128_EPSILON)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(round_error,    __float128, __float128, static_cast<__float128>(0.5))
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(norm_min,       __float128, __float128, FLT128_MIN)
+
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits,         __float128,        int, FLT128_MANT_DIG)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(digits10,       __float128,        int, FLT128_DIG)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_digits10,   __float128,        int, 36)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(radix,          __float128,        int, 2)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(min_exponent,   __float128,        int, FLT128_MIN_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_exponent,   __float128,        int, FLT128_MAX_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(min_exponent10, __float128,        int, FLT128_MIN_10_EXP)
+KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(max_exponent10, __float128,        int, FLT128_MAX_10_EXP)
+// clang-format on
+
+#undef KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT
+}  // namespace Experimental
+}  // namespace Kokkos
+//</editor-fold>
+
+namespace Kokkos {
+template <>
+struct reduction_identity<__float128> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 sum() {
+    return static_cast<__float128>(0.0);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 prod() {
+    return static_cast<__float128>(1.0);
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 max() {
+    return -FLT128_MAX;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static __float128 min() {
+    return FLT128_MAX;
+  }
+};
+}  // namespace Kokkos
+
+//<editor-fold desc="Common mathematical functions __float128 overloads">
+namespace Kokkos {
+namespace Experimental {
+// clang-format off
+// Basic operations
+inline __float128 fabs(__float128 x) { return ::fabsq(x); }
+inline __float128 fmod(__float128 x, __float128 y) { return ::fmodq(x, y); }
+inline __float128 remainder(__float128 x, __float128 y) { return ::remainderq(x, y); }
+inline __float128 fmin(__float128 x, __float128 y) { return ::fminq(x, y); }
+inline __float128 fmax(__float128 x, __float128 y) { return ::fmaxq(x, y); }
+inline __float128 fdim(__float128 x, __float128 y) { return ::fdimq(x, y); }
+inline __float128 nanq(char const* arg) { return ::nanq(arg); }
+// Power functions
+inline __float128 pow(__float128 x, __float128 y) { return ::powq(x, y); }
+inline __float128 sqrt(__float128 x) { return ::sqrtq(x); }
+inline __float128 cbrt(__float128 x) { return ::cbrtq(x); }
+inline __float128 hypot(__float128 x, __float128 y) { return ::hypotq(x, y); }
+// Exponential functions
+inline __float128 exp(__float128 x) { return ::expq(x); }
+#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU >= 910)
+inline __float128 exp2(__float128 x) { return ::exp2q(x); }
+#endif
+inline __float128 expm1(__float128 x) { return ::expm1q(x); }
+inline __float128 log(__float128 x) { return ::logq(x); }
+inline __float128 log10(__float128 x) { return ::log10q(x); }
+inline __float128 log2(__float128 x) { return ::log2q(x); }
+inline __float128 log1p(__float128 x) { return ::log1pq(x); }
+// Trigonometric functions
+inline __float128 sin(__float128 x) { return ::sinq(x); }
+inline __float128 cos(__float128 x) { return ::cosq(x); }
+inline __float128 tan(__float128 x) { return ::tanq(x); }
+inline __float128 asin(__float128 x) { return ::asinq(x); }
+inline __float128 acos(__float128 x) { return ::acosq(x); }
+inline __float128 atan(__float128 x) { return ::atanq(x); }
+inline __float128 atan2(__float128 x, __float128 y) { return ::atan2q(x, y); }
+// Hyperbolic functions
+inline __float128 sinh(__float128 x) { return ::sinhq(x); }
+inline __float128 cosh(__float128 x) { return ::coshq(x); }
+inline __float128 tanh(__float128 x) { return ::tanhq(x); }
+inline __float128 asinh(__float128 x) { return ::asinhq(x); }
+inline __float128 acosh(__float128 x) { return ::acoshq(x); }
+inline __float128 atanh(__float128 x) { return ::atanhq(x); }
+// Error and gamma functions
+inline __float128 erf(__float128 x) { return ::erfq(x); }
+inline __float128 erfc(__float128 x) { return ::erfcq(x); }
+inline __float128 tgamma(__float128 x) { return ::tgammaq(x); }
+inline __float128 lgamma(__float128 x) { return ::lgammaq(x); }
+// Nearest integer floating point operations
+inline __float128 ceil(__float128 x) { return ::ceilq(x); }
+inline __float128 floor(__float128 x) { return ::floorq(x); }
+inline __float128 trunc(__float128 x) { return ::truncq(x); }
+inline __float128 nearbyint(__float128 x) { return ::nearbyintq(x); }
+// Classification and comparison
+inline bool isfinite(__float128 x) { return !::isinfq(x) && !::isnanq(x); }  // isfiniteq not provided
+inline bool isinf(__float128 x) { return ::isinfq(x); }
+inline bool isnan(__float128 x) { return ::isnanq(x); }
+}  // namespace Experimental
+}  // namespace Kokkos
+//</editor-fold>
+
+#endif
+
+#endif
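
For readability, here is what a single invocation of the specialization macro above, e.g. `KOKKOS_IMPL_SPECIALIZE_NUMERIC_TRAIT(finite_max, __float128, __float128, FLT128_MAX)`, expands to when `KOKKOS_ENABLE_CXX17` is defined (derived directly from the macro definition; this is not additional code in the patch):

    template <>
    struct finite_max<__float128> {
      static constexpr __float128 value = FLT128_MAX;
    };
    template <>
    inline constexpr auto finite_max_v<__float128> = finite_max<__float128>::value;
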
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
index 4bd037906506bd27654067d5c2fda99fb59684ca..c49e838d8f0b0961b9dfd2bc76c07b5370cb2629 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -58,28 +58,59 @@
 
 namespace Kokkos {
 namespace Impl {
-namespace {
 
-HostThreadTeamData g_serial_thread_team_data;
+bool SerialInternal::is_initialized() { return m_is_initialized; }
 
-bool g_serial_is_initialized = false;
+void SerialInternal::initialize() {
+  if (is_initialized()) return;
 
-}  // namespace
+  Impl::SharedAllocationRecord<void, void>::tracking_enable();
+
+  // Init the array of locks used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+  m_is_initialized = true;
+}
+
+void SerialInternal::finalize() {
+  if (m_thread_team_data.scratch_buffer()) {
+    m_thread_team_data.disband_team();
+    m_thread_team_data.disband_pool();
+
+    Kokkos::HostSpace space;
+
+    space.deallocate(m_thread_team_data.scratch_buffer(),
+                     m_thread_team_data.scratch_bytes());
+
+    m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
+  }
+
+  Kokkos::Profiling::finalize();
+
+  m_is_initialized = false;
+}
+
+SerialInternal& SerialInternal::singleton() {
+  static SerialInternal* self = nullptr;
+  if (!self) {
+    self = new SerialInternal();
+  }
+  return *self;
+}
 
 // Resize thread team data scratch memory
-void serial_resize_thread_team_data(size_t pool_reduce_bytes,
-                                    size_t team_reduce_bytes,
-                                    size_t team_shared_bytes,
-                                    size_t thread_local_bytes) {
+void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes,
+                                             size_t team_reduce_bytes,
+                                             size_t team_shared_bytes,
+                                             size_t thread_local_bytes) {
   if (pool_reduce_bytes < 512) pool_reduce_bytes = 512;
   if (team_reduce_bytes < 512) team_reduce_bytes = 512;
 
-  const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
-  const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
-  const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
-  const size_t old_thread_local =
-      g_serial_thread_team_data.thread_local_bytes();
-  const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
+  const size_t old_pool_reduce  = m_thread_team_data.pool_reduce_bytes();
+  const size_t old_team_reduce  = m_thread_team_data.team_reduce_bytes();
+  const size_t old_team_shared  = m_thread_team_data.team_shared_bytes();
+  const size_t old_thread_local = m_thread_team_data.thread_local_bytes();
+  const size_t old_alloc_bytes  = m_thread_team_data.scratch_bytes();
 
   // Allocate if any part of the old allocation is too small:
 
@@ -92,12 +123,12 @@ void serial_resize_thread_team_data(size_t pool_reduce_bytes,
     Kokkos::HostSpace space;
 
     if (old_alloc_bytes) {
-      g_serial_thread_team_data.disband_team();
-      g_serial_thread_team_data.disband_pool();
+      m_thread_team_data.disband_team();
+      m_thread_team_data.disband_pool();
 
       space.deallocate("Kokkos::Serial::scratch_mem",
-                       g_serial_thread_team_data.scratch_buffer(),
-                       g_serial_thread_team_data.scratch_bytes());
+                       m_thread_team_data.scratch_buffer(),
+                       m_thread_team_data.scratch_bytes());
     }
 
     if (pool_reduce_bytes < old_pool_reduce) {
@@ -125,56 +156,37 @@ void serial_resize_thread_team_data(size_t pool_reduce_bytes,
       Kokkos::Impl::throw_runtime_exception(failure.get_error_message());
     }
 
-    g_serial_thread_team_data.scratch_assign(
-        ((char*)ptr), alloc_bytes, pool_reduce_bytes, team_reduce_bytes,
-        team_shared_bytes, thread_local_bytes);
+    m_thread_team_data.scratch_assign(static_cast<char*>(ptr), alloc_bytes,
+                                      pool_reduce_bytes, team_reduce_bytes,
+                                      team_shared_bytes, thread_local_bytes);
 
-    HostThreadTeamData* pool[1] = {&g_serial_thread_team_data};
+    HostThreadTeamData* pool[1] = {&m_thread_team_data};
 
-    g_serial_thread_team_data.organize_pool(pool, 1);
-    g_serial_thread_team_data.organize_team(1);
+    m_thread_team_data.organize_pool(pool, 1);
+    m_thread_team_data.organize_team(1);
   }
 }
-
-HostThreadTeamData* serial_get_thread_team_data() {
-  return &g_serial_thread_team_data;
-}
-
 }  // namespace Impl
-}  // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
+Serial::Serial()
+#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
+    : m_space_instance(&Impl::SerialInternal::singleton()) {
+}
+#else
+    : m_space_instance(&Impl::SerialInternal::singleton(),
+                       [](Impl::SerialInternal*) {}) {
+}
+#endif
 
-bool Serial::impl_is_initialized() { return Impl::g_serial_is_initialized; }
+bool Serial::impl_is_initialized() {
+  return Impl::SerialInternal::singleton().is_initialized();
+}
 
 void Serial::impl_initialize() {
-  Impl::SharedAllocationRecord<void, void>::tracking_enable();
-
-  // Init the array of locks used for arbitrarily sized atomics
-  Impl::init_lock_array_host_space();
-
-  Impl::g_serial_is_initialized = true;
+  Impl::SerialInternal::singleton().initialize();
 }
 
-void Serial::impl_finalize() {
-  if (Impl::g_serial_thread_team_data.scratch_buffer()) {
-    Impl::g_serial_thread_team_data.disband_team();
-    Impl::g_serial_thread_team_data.disband_pool();
-
-    Kokkos::HostSpace space;
-
-    space.deallocate(Impl::g_serial_thread_team_data.scratch_buffer(),
-                     Impl::g_serial_thread_team_data.scratch_bytes());
-
-    Impl::g_serial_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
-  }
-
-  Kokkos::Profiling::finalize();
-
-  Impl::g_serial_is_initialized = false;
-}
+void Serial::impl_finalize() { Impl::SerialInternal::singleton().finalize(); }
 
 const char* Serial::name() { return "Serial"; }
 
@@ -198,6 +210,9 @@ void SerialSpaceInitializer::finalize(const bool) {
 }
 
 void SerialSpaceInitializer::fence() { Kokkos::Serial::impl_static_fence(); }
+void SerialSpaceInitializer::fence(const std::string& name) {
+  Kokkos::Serial::impl_static_fence(name);
+}
 
 void SerialSpaceInitializer::print_configuration(std::ostream& msg,
                                                  const bool detail) {
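
The Serial backend state above now lives in a `SerialInternal` object reached through `singleton()`, which heap-allocates the instance on first use and never deletes it. A common motivation for that shape (my assumption; the patch does not state one) is to avoid static-destruction-order problems at program teardown. A stripped-down sketch of the accessor pattern:

    // Hypothetical stand-in: the instance is created on first call and
    // intentionally leaked, so it can never be destroyed before other statics
    // that might still use it at exit.
    class Internal {
     public:
      static Internal& singleton() {
        static Internal* self = nullptr;
        if (!self) self = new Internal();
        return *self;
      }

     private:
      Internal() = default;
    };
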
diff --git a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
index 3ac3899acaf9f3695025472fc5f02cb0708f64fb..be732f4486d4618b4f8601d1859be44ce1a31296 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -76,14 +76,18 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Serial, QueueType> > {
   static void execute(scheduler_type const& scheduler) {
     using task_base_type = typename scheduler_type::task_base_type;
 
+    auto const& serial_execution_space = scheduler.get_execution_space();
+
     // Set default buffers
-    serial_resize_thread_team_data(0,   /* global reduce buffer */
-                                   512, /* team reduce buffer */
-                                   0,   /* team shared buffer */
-                                   0    /* thread local buffer */
-    );
+    serial_execution_space.impl_internal_space_instance()
+        ->resize_thread_team_data(0,   /* global reduce buffer */
+                                  512, /* team reduce buffer */
+                                  0,   /* team shared buffer */
+                                  0    /* thread local buffer */
+        );
 
-    Impl::HostThreadTeamData& self = *Impl::serial_get_thread_team_data();
+    auto& self = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
     auto& queue         = scheduler.queue();
     auto team_scheduler = scheduler.get_team_scheduler(0);
@@ -147,9 +151,11 @@ class TaskQueueSpecializationConstrained<
 
     task_base_type* const end = (task_base_type*)task_base_type::EndTag;
 
-    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();
+    execution_space serial_execution_space;
+    auto& data = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
-    member_type exec(scheduler, *data);
+    member_type exec(scheduler, data);
 
     // Loop until no runnable task
 
@@ -181,18 +187,22 @@ class TaskQueueSpecializationConstrained<
 
     task_base_type* const end = (task_base_type*)task_base_type::EndTag;
 
+    execution_space serial_execution_space;
+
     // Set default buffers
-    serial_resize_thread_team_data(0,   /* global reduce buffer */
-                                   512, /* team reduce buffer */
-                                   0,   /* team shared buffer */
-                                   0    /* thread local buffer */
-    );
+    serial_execution_space.impl_internal_space_instance()
+        ->resize_thread_team_data(0,   /* global reduce buffer */
+                                  512, /* team reduce buffer */
+                                  0,   /* team shared buffer */
+                                  0    /* thread local buffer */
+        );
 
     auto* const queue = scheduler.m_queue;
 
-    Impl::HostThreadTeamData* const data = Impl::serial_get_thread_team_data();
+    auto& data = serial_execution_space.impl_internal_space_instance()
+                     ->m_thread_team_data;
 
-    member_type exec(scheduler, *data);
+    member_type exec(scheduler, data);
 
     // Loop until all queues are empty
     while (0 < queue->m_ready_count) {
@@ -210,16 +220,6 @@ class TaskQueueSpecializationConstrained<
 
         (*task->m_apply)(task, &exec);
 
-#if 0
-        printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
-        , uintptr_t(task)
-        , uintptr_t(task->m_wait)
-        , uintptr_t(task->m_next)
-        , task->m_task_type
-        , task->m_priority
-        , task->m_ref_count );
-#endif
-
         // If a respawn then re-enqueue otherwise the task is complete
         // and all tasks waiting on this task are updated.
         queue->complete(task);
diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
index 917ae72081c6a5eee98b4e02827097446cb29e0b..3efff98e459e8a8b92983d195a7a4486672bdce4 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -259,6 +259,9 @@ SharedAllocationRecord<void, void>* SharedAllocationRecord<
     while ((root_next = Kokkos::atomic_exchange(&arg_record->m_root->m_next,
                                                 zero)) == nullptr)
       ;
+    // We need a memory_fence() here so that the following update
+    // is properly sequenced
+    Kokkos::memory_fence();
 
     arg_record->m_next->m_prev = arg_record->m_prev;
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
index 0773a0914befe4e9db3b3b79ae3c446bcb0f3ad1..7f222c92ca704908e7e6be05229c976d113395f9 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SimpleTaskScheduler.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
index a0eccffb627f39f1810978aa0d3ab25c9458e4e8..0584cd29eb70470f4c206317d35a57f24893a518 100644
--- a/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_SingleTaskQueue.hpp
@@ -58,8 +58,7 @@
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
 #include <impl/Kokkos_TaskQueueCommon.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_Tags.hpp b/packages/kokkos/core/src/impl/Kokkos_Tags.hpp
deleted file mode 100644
index eea4c938661afa00f4dad929312bf6cfa2b83776..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/src/impl/Kokkos_Tags.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_TAGS_HPP
-#define KOKKOS_TAGS_HPP
-
-#include <impl/Kokkos_Traits.hpp>
-#include <Kokkos_Core_fwd.hpp>
-#include <type_traits>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** KOKKOS_IMPL_HAS_TYPE( Type )
- *
- * defines a meta-function that check if a type expose an internal alias which
- * matches Type
- *
- * e.g.
- *   KOKKOS_IMPL_HAS_TYPE( array_layout );
- *   struct Foo { using array_layout = void; };
- *   have_array_layout<Foo>::value == 1;
- */
-#define KOKKOS_IMPL_HAS_TYPE(TYPE)                                             \
-  template <typename T>                                                        \
-  struct have_##TYPE {                                                         \
-   private:                                                                    \
-    template <typename U, typename = void>                                     \
-    struct X : std::false_type {};                                             \
-    template <typename U>                                                      \
-    struct X<U, typename std::conditional<true, void, typename X::TYPE>::type> \
-        : std::true_type {};                                                   \
-                                                                               \
-   public:                                                                     \
-    using type = typename X<T>::type;                                          \
-    enum : bool { value = type::value };                                       \
-  };
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template <typename T>
-using is_void = std::is_same<void, T>;
-
-}
-}  // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
index 2d0f62a563712a1182849fd8dc43349f6996a42e..06581052a8f687aeaff85d1368fd271407f0c36e 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskBase.hpp
@@ -203,14 +203,17 @@ class TaskBase {
 
     // Assign dependence to m_next.  It will be processed in the subsequent
     // call to schedule.  Error if the dependence is reset.
-    if (lock != Kokkos::atomic_exchange(&m_next, dep)) {
+    if (lock != Kokkos::Impl::desul_atomic_exchange(
+                    &m_next, dep, Kokkos::Impl::MemoryOrderSeqCst(),
+                    Kokkos::Impl::MemoryScopeDevice())) {
       Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
     }
-
     if (nullptr != dep) {
       // The future may be destroyed upon returning from this call
       // so increment reference count to track this assignment.
-      Kokkos::atomic_increment(&(dep->m_ref_count));
+      Kokkos::Impl::desul_atomic_inc(&(dep->m_ref_count),
+                                     Kokkos::Impl::MemoryOrderSeqCst(),
+                                     Kokkos::Impl::MemoryScopeDevice());
     }
   }
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
index 42afa93cdcc4db4f4c0223d7b85f5edb8256ee31..caf1d0a84b82e3e6b121476c8cfd0213775dd69f 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskNode.hpp
@@ -151,6 +151,7 @@ class ReferenceCountedBase {
   bool decrement_and_check_reference_count() {
     // TODO @tasking @memory_order DSH memory order
     auto old_count = Kokkos::atomic_fetch_add(&m_ref_count, -1);
+    Kokkos::memory_fence();
 
     KOKKOS_ASSERT(old_count > 0 && "reference count must be greater than zero!");
 
@@ -158,7 +159,11 @@ class ReferenceCountedBase {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void increment_reference_count() { Kokkos::atomic_increment(&m_ref_count); }
+  void increment_reference_count() {
+    Kokkos::Impl::desul_atomic_inc(&m_ref_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
+  }
 };
 
 template <class TaskQueueTraits, class SchedulingInfo>
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
index c0d2eca9c106305e1bfdeb8efd634f657df90c8b..e74e84a2e535b5953ae58667a2a7f4b4f53b293d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -58,8 +58,7 @@
 #include <impl/Kokkos_TaskBase.hpp>
 #include <impl/Kokkos_TaskResult.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -188,25 +187,11 @@ class TaskQueue : public TaskQueueBase {
   // Assign task pointer with reference counting of assigned tasks
   KOKKOS_FUNCTION static void assign(task_root_type** const lhs,
                                      task_root_type* const rhs) {
-#if 0
-  {
-    printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
-          , uintptr_t( lhs ? *lhs : 0 )
-          , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 )
-          , int( lhs && *lhs ? (*lhs)->m_task_type : 0 )
-          , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 )
-          , uintptr_t(rhs)
-          , uintptr_t( rhs ? rhs->m_next : 0 )
-          , int( rhs ? rhs->m_task_type : 0 )
-          , int( rhs ? rhs->m_ref_count : 0 )
-          );
-    fflush( stdout );
-  }
-#endif
-
     if (*lhs) decrement(*lhs);
     if (rhs) {
-      Kokkos::atomic_increment(&(rhs->m_ref_count));
+      Kokkos::Impl::desul_atomic_inc(&rhs->m_ref_count,
+                                     Kokkos::Impl::MemoryOrderSeqCst(),
+                                     Kokkos::Impl::MemoryScopeDevice());
     }
 
     // Force write of *lhs
@@ -234,13 +219,7 @@ class TaskQueue : public TaskQueueBase {
 
     using task_type = Impl::Task<execution_space, value_type, FunctorType>;
 
-    enum : size_t { align = (1 << 4), align_mask = align - 1 };
-    enum : size_t { task_size = sizeof(task_type) };
-    enum : size_t { result_size = Impl::TaskResult<value_type>::size };
-    enum : size_t {
-      alloc_size = ((task_size + align_mask) & ~align_mask) +
-                   ((result_size + align_mask) & ~align_mask)
-    };
+    constexpr size_t task_size = sizeof(task_type);
 
     return m_memory.allocate_block_size(task_size);
   }
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
index cae06d4ea5ca17b5924a7bbf8c415d6f0a3ab070..757e5f98864bc3c74faa8f4bdfffb795e68aab60 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueCommon.hpp
@@ -57,8 +57,7 @@
 #include <impl/Kokkos_TaskResult.hpp>
 
 #include <impl/Kokkos_TaskQueueMemoryManager.hpp>
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -88,6 +87,7 @@ class TaskQueueCommonMixin {
   // <editor-fold desc="Constructors, destructor, and assignment"> {{{2
 
   TaskQueueCommonMixin() : m_ready_count(0) {
+    Kokkos::memory_fence();
     // TODO @tasking @memory_order DSH figure out if I need this store to be
     // atomic
   }
@@ -158,14 +158,17 @@ class TaskQueueCommonMixin {
   KOKKOS_INLINE_FUNCTION
   void _increment_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::atomic_increment(&this->m_ready_count);
+    Kokkos::Impl::desul_atomic_inc(&this->m_ready_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
   }
 
   KOKKOS_INLINE_FUNCTION
   void _decrement_ready_count() {
     // TODO @tasking @memory_order DSH memory order
-    Kokkos::atomic_decrement(&this->m_ready_count);
-    Kokkos::memory_fence();
+    Kokkos::Impl::desul_atomic_dec(&this->m_ready_count,
+                                   Kokkos::Impl::MemoryOrderSeqCst(),
+                                   Kokkos::Impl::MemoryScopeDevice());
   }
 
  public:
@@ -476,7 +479,7 @@ class TaskQueueCommonMixin {
   }
 
   template <class ExecutionSpace, class MemorySpace, class MemoryPool>
-  static /* KOKKOS_CONSTEXPR_14 */ size_t task_queue_allocation_size(
+  static /* constexpr */ size_t task_queue_allocation_size(
       ExecutionSpace const&, MemorySpace const&, MemoryPool const&)
   // requires Same<ExecutionSpace, typename Derived::execution_space>
   //            && Same<MemorySpace, typename Derived::memory_space>
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
index 6e2481f93567a671a5ad66f3536b45c009286eca..3a71aa17e69042c791e9e7302c3c14cdca91b8aa 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMemoryManager.hpp
@@ -56,8 +56,7 @@
 #include <impl/Kokkos_TaskBase.hpp>
 #include <impl/Kokkos_TaskResult.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
+#include <Kokkos_Atomic.hpp>
 #include <impl/Kokkos_OptionalRef.hpp>
 #include <impl/Kokkos_LIFO.hpp>
 
@@ -103,8 +102,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
     } else {
       void* data = m_pool.allocate(static_cast<size_t>(requested_size));
 
-      // Kokkos::atomic_increment(&m_accum_alloc); // memory_order_relaxed
-      Kokkos::atomic_increment(&m_count_alloc);  // memory_order_relaxed
+      Kokkos::Impl::desul_atomic_inc(
+          &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+          Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
       // TODO @tasking @minor DSH make this thread safe? (otherwise, it's just
       // an approximation, which is probably fine...)
       if (m_max_alloc < m_count_alloc) m_max_alloc = m_count_alloc;
@@ -200,7 +200,9 @@ class TaskQueueMemoryManager : public TaskQueueBase {
   KOKKOS_INLINE_FUNCTION void deallocate(
       PoolAllocatedObjectBase<CountType>&& obj) {
     m_pool.deallocate((void*)&obj, 1);
-    Kokkos::atomic_decrement(&m_count_alloc);  // memory_order_relaxed
+    Kokkos::Impl::desul_atomic_dec(
+        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 
   KOKKOS_INLINE_FUNCTION
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
index efee3d051dc8fb4e112219527bb322404ff4dfe6..5f98e8d85e9214289ce43f98b920ec27da4a672f 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueueMultiple.hpp
@@ -59,9 +59,7 @@
 #include <impl/Kokkos_TaskResult.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
 
-#include <impl/Kokkos_Memory_Fence.hpp>
-#include <impl/Kokkos_Atomic_Increment.hpp>
-#include <impl/Kokkos_Atomic_Decrement.hpp>
+#include <Kokkos_Atomic.hpp>
 
 #include <string>
 #include <typeinfo>
@@ -159,8 +157,14 @@ class TaskQueueMultiple : public TaskQueue<ExecSpace, MemorySpace> {
               // task stolen.
               // first increment our ready count, then decrement the ready count
               // on the other queue:
-              Kokkos::atomic_increment(&this->m_ready_count);
-              Kokkos::atomic_decrement(&steal_from.m_ready_count);
+              Kokkos::Impl::desul_atomic_inc(
+                  &this->m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
+                                                       // memory_order_relaxed
+              Kokkos::Impl::desul_atomic_dec(
+                  &steal_from.m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice());  // TODO?
+                                                       // memory_order_relaxed
               return rv;
             }
           }
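The comment in the stealing path above is deliberate about ordering: the thief's ready count goes up before the victim's comes down, so the sum over all queues never transiently undercounts the work still in flight. A stand-alone sketch of just that accounting, with std::atomic counters standing in for the real queues:

#include <atomic>

struct QueueCounters {
  std::atomic<int> ready_count{0};
};

// Move the accounting for one stolen task from victim to thief. Incrementing
// first keeps thief.ready_count + victim.ready_count from dipping below the
// true number of pending tasks at any point during the hand-off.
inline void account_for_steal(QueueCounters& thief, QueueCounters& victim) {
  thief.ready_count.fetch_add(1, std::memory_order_seq_cst);
  victim.ready_count.fetch_sub(1, std::memory_order_seq_cst);
}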
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
index a87e5f72721f95f06d1c9f90c17e92d0e1fec2fb..324227cf5e615f184492de9471fa0f78700ae11a 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -105,6 +105,7 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::decrement(
   task_root_type volatile &t = *task;
 
   const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count), -1);
+  Kokkos::memory_fence();
 
 #if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
   if (1 == count) {
@@ -146,8 +147,9 @@ KOKKOS_FUNCTION void *TaskQueue<ExecSpace, MemorySpace>::allocate(size_t n) {
   void *const p = m_memory.allocate(n);
 
   if (p) {
-    // Kokkos::atomic_increment( & m_accum_alloc );
-    Kokkos::atomic_increment(&m_count_alloc);
+    Kokkos::Impl::desul_atomic_inc(
+        &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     // if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
   }
@@ -159,7 +161,9 @@ template <typename ExecSpace, typename MemorySpace>
 KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::deallocate(void *p,
                                                                    size_t n) {
   m_memory.deallocate(p, n);
-  Kokkos::atomic_decrement(&m_count_alloc);
+  Kokkos::Impl::desul_atomic_dec(
+      &m_count_alloc, Kokkos::Impl::MemoryOrderSeqCst(),
+      Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 }
 
 //----------------------------------------------------------------------------
@@ -210,7 +214,9 @@ KOKKOS_FUNCTION bool TaskQueue<ExecSpace, MemorySpace>::push_task(
     //     *queue = task;
     //   }
     //   old_head = *queue;
-    old_head = Kokkos::atomic_compare_exchange(queue, old_head, task);
+    old_head = Kokkos::Impl::desul_atomic_compare_exchange(
+        const_cast<task_root_type **>(queue), old_head, task,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (old_head_tmp == old_head) return true;
   }
@@ -258,7 +264,10 @@ TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
 
     task_root_type *const x = task;
 
-    task = Kokkos::atomic_compare_exchange(queue, x, lock);
+    //    task = Kokkos::atomic_compare_exchange(queue, x, lock);
+    task = Kokkos::Impl::desul_atomic_compare_exchange(
+        const_cast<task_root_type **>(queue), x, lock,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (x == task) {
       // CAS succeeded and queue is locked
@@ -274,6 +283,8 @@ TaskQueue<ExecSpace, MemorySpace>::pop_ready_task(
       // This thread has exclusive access to
       // the queue and the popped task's m_next.
 
+      Kokkos::memory_fence();
+
       task_root_type *volatile &next = task->m_next;
 
       // This algorithm is not lock-free because an adversarial scheduler could
@@ -400,7 +411,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::schedule_runnable(
     // to track number of ready + executing tasks.
     // The ready count will be decremented when the task is complete.
 
-    Kokkos::atomic_increment(&m_ready_count);
+    Kokkos::Impl::desul_atomic_inc(
+        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
 
     task_root_type *volatile *const ready_queue =
         &m_ready[t.m_priority][t.m_task_type];
@@ -553,8 +566,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::reschedule(
 
   task_root_type *const zero = nullptr;
   task_root_type *const lock = (task_root_type *)task_root_type::LockTag;
-
-  if (lock != Kokkos::atomic_exchange(&task->m_next, zero)) {
+  if (lock != Kokkos::Impl::desul_atomic_exchange(
+                  &task->m_next, zero, Kokkos::Impl::MemoryOrderSeqCst(),
+                  Kokkos::Impl::MemoryScopeDevice())) {
     Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
   }
 }
@@ -601,8 +615,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
 
     // Stop other tasks from adding themselves to this task's wait queue
     // by locking the head of this task's wait queue.
-
-    task_root_type *x = Kokkos::atomic_exchange(&t.m_wait, lock);
+    task_root_type *x = Kokkos::Impl::desul_atomic_exchange(
+        const_cast<task_root_type **>(&t.m_wait), lock,
+        Kokkos::Impl::MemoryOrderSeqCst(), Kokkos::Impl::MemoryScopeDevice());
 
     if (x != (task_root_type *)lock) {
       // This thread has transitioned this 'task' to complete.
@@ -645,7 +660,9 @@ KOKKOS_FUNCTION void TaskQueue<ExecSpace, MemorySpace>::complete(
     // A runnable task was popped from a ready queue and executed.
     // If respawned into a ready queue then the ready count was incremented
     // so decrement whether respawned or not.
-    Kokkos::atomic_decrement(&m_ready_count);
+    Kokkos::Impl::desul_atomic_dec(
+        &m_ready_count, Kokkos::Impl::MemoryOrderSeqCst(),
+        Kokkos::Impl::MemoryScopeDevice());  // TODO? memory_order_relaxed
   }
 }
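The push_task and pop_ready_task hunks above keep the same retry-loop shape and only swap the primitive used for the head swap. That shape is the classic compare-and-swap push onto an intrusive singly linked list; a self-contained std::atomic rendition of the pattern (generic node type, not the Kokkos task type):

#include <atomic>

struct Node {
  Node* next = nullptr;
};

// Publish node at the head of the list: link it to the head we observed and
// retry whenever another thread moved the head first. compare_exchange_weak
// reloads the new head into observed on failure.
inline void push(std::atomic<Node*>& head, Node* node) {
  Node* observed = head.load(std::memory_order_relaxed);
  do {
    node->next = observed;
  } while (!head.compare_exchange_weak(observed, node,
                                       std::memory_order_release,
                                       std::memory_order_relaxed));
}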
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
index 2faab5794907ecd43ccead5f74e09e414a8f3541..f53dfe5a96621a0e31d3deb95bb83ac9bef35907 100644
--- a/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_TaskTeamMember.hpp
@@ -55,7 +55,6 @@
 //----------------------------------------------------------------------------
 
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Tags.hpp>
 
 #include <Kokkos_Future.hpp>
 #include <impl/Kokkos_TaskQueue.hpp>
diff --git a/packages/kokkos/core/src/impl/Kokkos_Timer.hpp b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
index e8004ff85258975d3f36ee2a9345414e27ea09ad..6edf571d7892e4260fc7d617a1a86a63d265baa2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Timer.hpp
@@ -45,8 +45,13 @@
 #ifndef KOKKOS_IMPLWALLTIME_HPP
 #define KOKKOS_IMPLWALLTIME_HPP
 
+#include <Kokkos_Macros.hpp>
+
+KOKKOS_IMPL_WARNING("This file is deprecated. Use <Kokkos_Timer.hpp> instead.")
+
 #include <Kokkos_Timer.hpp>
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
 namespace Kokkos {
 namespace Impl {
 
@@ -54,10 +59,11 @@ namespace Impl {
  *   Timer promoted from Impl to Kokkos ns
  *   This file included for backwards compatibility
  */
-
-using Kokkos::Timer;
+using Timer KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::Timer instead!") =
+    Kokkos::Timer;
 
 }  // namespace Impl
 }  // namespace Kokkos
+#endif
 
 #endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
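The guarded shim above keeps Kokkos::Impl::Timer usable under KOKKOS_ENABLE_DEPRECATED_CODE_3 while steering users toward Kokkos::Timer. KOKKOS_DEPRECATED_WITH_COMMENT is, in spirit, the standard [[deprecated]] attribute applied to an alias; a minimal stand-alone version of the idiom, with hypothetical "lib" names:

namespace lib {
struct Timer {};
}

namespace lib {
namespace old {
// Deprecated alias: existing code keeps compiling, but every use emits a
// warning that names the replacement.
using Timer [[deprecated("Use lib::Timer instead!")]] = lib::Timer;
}  // namespace old
}  // namespace lib

int main() {
  lib::old::Timer t;  // warns on conforming C++14 compilers
  (void)t;
}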
diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
index cb8cf281ae06fe0a71862b47428a2ffa12f4bd67..bea7c2c9d1e56a61802bc47c60f00e82496c8061 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -65,13 +65,6 @@ struct identity {
 template <typename T>
 using identity_t = typename identity<T>::type;
 
-struct not_a_type {
-  not_a_type()                  = delete;
-  ~not_a_type()                 = delete;
-  not_a_type(not_a_type const&) = delete;
-  void operator=(not_a_type const&) = delete;
-};
-
 #if defined(__cpp_lib_void_t)
 // since C++17
 using std::void_t;
@@ -158,6 +151,112 @@ struct destruct_delete {
 template <class...>
 struct type_list;
 
+//------------------------------------------------------------------------------
+// <editor-fold desc="type_list_remove_first"> {{{2
+
+// Currently linear complexity; if we use this a lot, maybe make it better?
+
+template <class Entry, class InList, class OutList>
+struct _type_list_remove_first_impl;
+
+template <class Entry, class T, class... Ts, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<T, Ts...>,
+                                    type_list<OutTs...>>
+    : _type_list_remove_first_impl<Entry, type_list<Ts...>,
+                                   type_list<OutTs..., T>> {};
+
+template <class Entry, class... Ts, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<Entry, Ts...>,
+                                    type_list<OutTs...>>
+    : _type_list_remove_first_impl<Entry, type_list<>,
+                                   type_list<OutTs..., Ts...>> {};
+
+template <class Entry, class... OutTs>
+struct _type_list_remove_first_impl<Entry, type_list<>, type_list<OutTs...>>
+    : identity<type_list<OutTs...>> {};
+
+template <class Entry, class List>
+struct type_list_remove_first
+    : _type_list_remove_first_impl<Entry, List, type_list<>> {};
+
+// </editor-fold> end type_list_remove_first }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="type_list_any"> {{{2
+
+template <template <class> class UnaryPred, class List>
+struct type_list_any;
+
+#ifdef KOKKOS_ENABLE_CXX17
+template <template <class> class UnaryPred, class... Ts>
+struct type_list_any<UnaryPred, type_list<Ts...>>
+    : std::bool_constant<(UnaryPred<Ts>::value || ...)> {};
+#else
+template <template <class> class UnaryPred, class T, class... Ts>
+struct type_list_any<UnaryPred, type_list<T, Ts...>> {
+  using type = typename std::conditional_t<
+      UnaryPred<T>::value, std::true_type,
+      type_list_any<UnaryPred, type_list<Ts...>>>::type;
+  static constexpr auto value = type::value;
+};
+
+template <template <class> class UnaryPred>
+struct type_list_any<UnaryPred, type_list<>> : std::false_type {};
+
+#endif
+
+// </editor-fold> end type_list_any }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="concat_type_list"> {{{2
+//  concat_type_list combines types in multiple type_lists
+
+// forward declaration
+template <typename... T>
+struct concat_type_list;
+
+// alias
+template <typename... T>
+using concat_type_list_t = typename concat_type_list<T...>::type;
+
+// final instantiation
+template <typename... T>
+struct concat_type_list<type_list<T...>> {
+  using type = type_list<T...>;
+};
+
+// combine consecutive type_lists
+template <typename... T, typename... U, typename... Tail>
+struct concat_type_list<type_list<T...>, type_list<U...>, Tail...>
+    : concat_type_list<type_list<T..., U...>, Tail...> {};
+// </editor-fold> end concat_type_list }}}2
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// <editor-fold desc="filter_type_list"> {{{2
+//  filter_type_list generates a type_list of the types which satisfy
+//  PredicateT<T>::value == ValueT
+
+template <template <typename> class PredicateT, typename TypeListT,
+          bool ValueT = true>
+struct filter_type_list;
+
+template <template <typename> class PredicateT, typename... T, bool ValueT>
+struct filter_type_list<PredicateT, type_list<T...>, ValueT> {
+  using type =
+      concat_type_list_t<std::conditional_t<PredicateT<T>::value == ValueT,
+                                            type_list<T>, type_list<>>...>;
+};
+
+template <template <typename> class PredicateT, typename T, bool ValueT = true>
+using filter_type_list_t =
+    typename filter_type_list<PredicateT, T, ValueT>::type;
+
+// </editor-fold> end filter_type_list }}}2
+//------------------------------------------------------------------------------
+
 // </editor-fold> end type_list }}}1
 //==============================================================================
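The new metafunctions are easiest to digest from their observable results. The sketch below re-declares a minimal type_list, concat_type_list, and filter_type_list with the same shape as the additions above (outside any Kokkos namespace, purely as an illustration) and pins their behaviour down with static_assert:

#include <type_traits>

template <class...>
struct type_list;

// concat_type_list: fold consecutive type_lists into a single one.
template <class... T>
struct concat_type_list;
template <class... T>
using concat_type_list_t = typename concat_type_list<T...>::type;
template <class... T>
struct concat_type_list<type_list<T...>> {
  using type = type_list<T...>;
};
template <class... T, class... U, class... Tail>
struct concat_type_list<type_list<T...>, type_list<U...>, Tail...>
    : concat_type_list<type_list<T..., U...>, Tail...> {};

// filter_type_list: keep the entries whose predicate result equals ValueT.
template <template <class> class Pred, class List, bool ValueT = true>
struct filter_type_list;
template <template <class> class Pred, class... T, bool ValueT>
struct filter_type_list<Pred, type_list<T...>, ValueT> {
  using type = concat_type_list_t<std::conditional_t<
      Pred<T>::value == ValueT, type_list<T>, type_list<>>...>;
};

static_assert(
    std::is_same<
        concat_type_list_t<type_list<int>, type_list<>, type_list<char>>,
        type_list<int, char>>::value,
    "concatenation preserves order and drops the empty lists");
static_assert(
    std::is_same<
        typename filter_type_list<std::is_integral,
                                  type_list<int, float, long>>::type,
        type_list<int, long>>::value,
    "filtering keeps exactly the matching types");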
 
diff --git a/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
index 41607a2a8e7fedc56fd92fdfa9a472bda10bc547..ace826dd5a7f726cd9e0e2b3ce14b081a26680f2 100644
--- a/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_VLAEmulation.hpp
@@ -130,20 +130,20 @@ struct ObjectWithVLAEmulation {
   // CRTP boilerplate
 
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   Derived* _this() noexcept {
     return VLAEmulationAccess::_cast_to_derived(this);
   }
 
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   Derived const* _this() const noexcept {
     return VLAEmulationAccess::_cast_to_derived(this);
   }
 
   // Note: can't be constexpr because of reinterpret_cast
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   vla_value_type* _vla_pointer() noexcept {
     // The data starts right after the aligned storage of Derived
     return reinterpret_cast<vla_value_type*>(_this() + 1);
@@ -151,7 +151,7 @@ struct ObjectWithVLAEmulation {
 
   // Note: can't be constexpr because of reinterpret_cast
   KOKKOS_FORCEINLINE_FUNCTION
-  /* KOKKOS_CONSTEXPR_14 */
+  /* constexpr */
   vla_value_type const* _vla_pointer() const noexcept {
     // The data starts right after the aligned storage of Derived
     return reinterpret_cast<vla_value_type const*>(_this() + 1);
@@ -159,7 +159,7 @@ struct ObjectWithVLAEmulation {
 
  public:
   KOKKOS_INLINE_FUNCTION
-  static /* KOKKOS_CONSTEXPR_14 */ size_t required_allocation_size(
+  static /* constexpr */ size_t required_allocation_size(
       vla_entry_count_type num_vla_entries) {
     KOKKOS_EXPECTS(num_vla_entries >= 0);
     return sizeof(Derived) + num_vla_entries * sizeof(VLAValueType);
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
index b9e32a04e09afcf1a5fcbeba0bd81257631f7714..797b3f584b6234b290a809e7f1c502e04249f058 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -144,10 +144,10 @@ struct ViewCtorProp<typename std::enable_if<is_view_label<Label>::value>::type,
 };
 
 template <typename Space>
-struct ViewCtorProp<typename std::enable_if<
-                        Kokkos::Impl::is_memory_space<Space>::value ||
-                        Kokkos::Impl::is_execution_space<Space>::value>::type,
-                    Space> {
+struct ViewCtorProp<
+    typename std::enable_if<Kokkos::is_memory_space<Space>::value ||
+                            Kokkos::is_execution_space<Space>::value>::type,
+    Space> {
   ViewCtorProp()                     = default;
   ViewCtorProp(const ViewCtorProp &) = default;
   ViewCtorProp &operator=(const ViewCtorProp &) = default;
@@ -207,10 +207,10 @@ template <typename... P>
 struct ViewCtorProp : public ViewCtorProp<void, P>... {
  private:
   using var_memory_space =
-      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_memory_space, P...>;
+      Kokkos::Impl::has_condition<void, Kokkos::is_memory_space, P...>;
 
   using var_execution_space =
-      Kokkos::Impl::has_condition<void, Kokkos::Impl::is_execution_space, P...>;
+      Kokkos::Impl::has_condition<void, Kokkos::is_execution_space, P...>;
 
   struct VOIDDUMMY {};
 
@@ -270,7 +270,6 @@ struct ViewCtorProp : public ViewCtorProp<void, P>... {
 
 namespace Kokkos {
 
-/* For backward compatibility */
 namespace Impl {
 struct ViewAllocateWithoutInitializingBackwardCompat {};
 
@@ -291,7 +290,6 @@ struct ViewCtorProp<WithoutInitializing_t, std::string,
 };
 } /* namespace Impl */
 
-/*[[deprecated(Use Kokkos::alloc(Kokkos::WithoutInitializing, label) instead]]*/
 using ViewAllocateWithoutInitializing =
     Impl::ViewCtorProp<Impl::WithoutInitializing_t, std::string,
                        Impl::ViewAllocateWithoutInitializingBackwardCompat>;
diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index a380a306931f4150e95b6f433c8bb076b091c456..9523118748f09907933764aea6ca94a085e68a9d 100644
--- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -49,6 +49,7 @@
 #include <initializer_list>
 
 #include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_DetectionIdiom.hpp>
 #include <Kokkos_Pair.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_Extents.hpp>
@@ -862,7 +863,7 @@ struct ViewDataAnalysis {
 namespace Kokkos {
 namespace Impl {
 
-template <class Dimension, class Layout, typename Enable = void>
+template <class Dimension, class Layout, class Enable = void>
 struct ViewOffset {
   using is_mapping_plugin = std::false_type;
 };
@@ -1389,7 +1390,8 @@ struct ViewOffset<
     KOKKOS_INLINE_FUNCTION
     static constexpr size_t stride(size_t const N) {
       return ((align != 0) &&
-              ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) &&
+              ((static_cast<int>(Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD) *
+                static_cast<int>(align)) < N) &&
               ((N % div_ok) != 0))
                  ? N + align - (N % div_ok)
                  : N;
@@ -2022,7 +2024,8 @@ struct ViewOffset<
     KOKKOS_INLINE_FUNCTION
     static constexpr size_t stride(size_t const N) {
       return ((align != 0) &&
-              ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) &&
+              ((static_cast<int>(Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD) *
+                static_cast<int>(align)) < N) &&
               ((N % div_ok) != 0))
                  ? N + align - (N % div_ok)
                  : N;
@@ -2816,6 +2819,22 @@ struct ViewDataHandle<
 namespace Kokkos {
 namespace Impl {
 
+template <typename T>
+inline bool is_zero_byte(const T& t) {
+  using comparison_type = std::conditional_t<
+      sizeof(T) % sizeof(long long int) == 0, long long int,
+      std::conditional_t<
+          sizeof(T) % sizeof(long int) == 0, long int,
+          std::conditional_t<
+              sizeof(T) % sizeof(int) == 0, int,
+              std::conditional_t<sizeof(T) % sizeof(short int) == 0, short int,
+                                 char>>>>;
+  const auto* const ptr = reinterpret_cast<const comparison_type*>(&t);
+  for (std::size_t i = 0; i < sizeof(T) / sizeof(comparison_type); ++i)
+    if (ptr[i] != 0) return false;
+  return true;
+}
+
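is_zero_byte exists so that the view-initialization code further down can ask a precise question: does value-initializing this element type produce an all-zero object representation? When it does, a memset-style fill is observationally equivalent to constructing every element. A simplified stand-alone check with the same intent (byte-wise rather than chunked, and the helper name is made up for the example):

#include <cstring>

// Compare the object representation of a value-initialized T against zeros.
template <typename T>
bool value_initializes_to_zero_bytes() {
  T value{};
  unsigned char zeros[sizeof(T)] = {};
  return std::memcmp(&value, zeros, sizeof(T)) == 0;
}

struct Plain {
  int i;
  double d;
};  // value-initialization zeroes everything: expect true

struct WithDefault {
  int i = 42;
};  // default member initializer makes the bytes non-zero: expect false

int main() {
  return (value_initializes_to_zero_bytes<Plain>() &&
          !value_initializes_to_zero_bytes<WithDefault>())
             ? 0
             : 1;
}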
 //----------------------------------------------------------------------------
 
 /*
@@ -2826,16 +2845,16 @@ namespace Impl {
  *  called from the shared memory tracking destruction.
  *  Secondarily to have two fewer partial specializations.
  */
-template <class ExecSpace, class ValueType,
+template <class DeviceType, class ValueType,
           bool IsScalar = std::is_scalar<ValueType>::value>
 struct ViewValueFunctor;
 
-template <class ExecSpace, class ValueType>
-struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
+template <class DeviceType, class ValueType>
+struct ViewValueFunctor<DeviceType, ValueType, false /* is_scalar */> {
+  using ExecSpace  = typename DeviceType::execution_space;
   using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
-  using Exec       = typename ExecSpace::execution_space;
 
-  Exec space;
+  ExecSpace space;
   ValueType* ptr;
   size_t n;
   bool destroy;
@@ -2864,11 +2883,50 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
         destroy(false),
         name(std::move(arg_name)) {}
 
-  void execute(bool arg) {
+  template <typename Dummy = ValueType>
+  std::enable_if_t<std::is_trivial<Dummy>::value &&
+                   std::is_trivially_copy_assignable<ValueType>::value>
+  construct_dispatch() {
+    ValueType value{};
+    if (Impl::is_zero_byte(value)) {
+      uint64_t kpID = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        // We are not really using parallel_for here but using beginParallelFor
+        // instead of begin_parallel_for (and adding "via memset") is the best
+        // we can do to indicate that this is not supposed to be tunable (and
+        // doesn't really execute a parallel_for).
+        Kokkos::Profiling::beginParallelFor(
+            "Kokkos::View::initialization [" + name + "] via memset",
+            Kokkos::Profiling::Experimental::device_id(space), &kpID);
+      }
+
+      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+          space,
+          Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
+          value);
+
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
+      }
+    } else {
+      parallel_for_implementation(false);
+    }
+  }
+
+  template <typename Dummy = ValueType>
+  std::enable_if_t<!(std::is_trivial<Dummy>::value &&
+                     std::is_trivially_copy_assignable<ValueType>::value)>
+  construct_dispatch() {
+    parallel_for_implementation(false);
+  }
+
+  void parallel_for_implementation(bool arg) {
     destroy = arg;
-    PolicyType policy(0, n);
-    std::string functor_name;
     if (!space.in_parallel()) {
+      PolicyType policy(0, n);
+      std::string functor_name;
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         functor_name =
@@ -2877,6 +2935,7 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
         Kokkos::Tools::Impl::begin_parallel_for(policy, *this, functor_name,
                                                 kpID);
       }
+
 #ifdef KOKKOS_ENABLE_CUDA
       if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
         Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n,
@@ -2886,7 +2945,7 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, policy);
       closure.execute();
-      space.fence();
+      space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Tools::Impl::end_parallel_for(policy, *this, functor_name,
                                               kpID);
@@ -2896,13 +2955,14 @@ struct ViewValueFunctor<ExecSpace, ValueType, false /* is_scalar */> {
     }
   }
 
-  void construct_shared_allocation() { execute(false); }
+  void construct_shared_allocation() { construct_dispatch(); }
 
-  void destroy_shared_allocation() { execute(true); }
+  void destroy_shared_allocation() { parallel_for_implementation(true); }
 };
 
-template <class ExecSpace, class ValueType>
-struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
+template <class DeviceType, class ValueType>
+struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
+  using ExecSpace  = typename DeviceType::execution_space;
   using PolicyType = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int64_t>>;
 
   ExecSpace space;
@@ -2921,12 +2981,54 @@ struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
                    size_t const arg_n, std::string arg_name)
       : space(arg_space), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)) {}
 
-  void construct_shared_allocation() {
-    if (!space.in_parallel()) {
+  template <typename Dummy = ValueType>
+  std::enable_if_t<std::is_trivial<Dummy>::value &&
+                   std::is_trivially_copy_assignable<Dummy>::value>
+  construct_shared_allocation() {
+    // Shortcut for zero initialization
+    ValueType value{};
+    if (Impl::is_zero_byte(value)) {
       uint64_t kpID = 0;
       if (Kokkos::Profiling::profileLibraryLoaded()) {
+        // We are not really using parallel_for here but using beginParallelFor
+        // instead of begin_parallel_for (and adding "via memset") is the best
+        // we can do to indicate that this is not supposed to be tunable (and
+        // doesn't really execute a parallel_for).
         Kokkos::Profiling::beginParallelFor(
-            "Kokkos::View::initialization [" + name + "]", 0, &kpID);
+            "Kokkos::View::initialization [" + name + "] via memset",
+            Kokkos::Profiling::Experimental::device_id(space), &kpID);
+      }
+
+      (void)ZeroMemset<ExecSpace, ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(
+          space,
+          Kokkos::View<ValueType*, typename DeviceType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::Unmanaged>>(ptr, n),
+          value);
+
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
+      }
+    } else {
+      parallel_for_implementation();
+    }
+  }
+
+  template <typename Dummy = ValueType>
+  std::enable_if_t<!(std::is_trivial<Dummy>::value &&
+                     std::is_trivially_copy_assignable<Dummy>::value)>
+  construct_shared_allocation() {
+    parallel_for_implementation();
+  }
+
+  void parallel_for_implementation() {
+    if (!space.in_parallel()) {
+      PolicyType policy(0, n);
+      std::string functor_name = "Kokkos::View::initialization [" + name + "]";
+      uint64_t kpID            = 0;
+      if (Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Tools::Impl::begin_parallel_for(policy, *this, functor_name,
+                                                kpID);
       }
 #ifdef KOKKOS_ENABLE_CUDA
       if (std::is_same<ExecSpace, Kokkos::Cuda>::value) {
@@ -2937,9 +3039,11 @@ struct ViewValueFunctor<ExecSpace, ValueType, true /* is_scalar */> {
       const Kokkos::Impl::ParallelFor<ViewValueFunctor, PolicyType> closure(
           *this, PolicyType(0, n));
       closure.execute();
-      space.fence();
+      space.fence(
+          "Kokkos::Impl::ViewValueFunctor: Fence after setting values in view");
       if (Kokkos::Profiling::profileLibraryLoaded()) {
-        Kokkos::Profiling::endParallelFor(kpID);
+        Kokkos::Tools::Impl::end_parallel_for(policy, *this, functor_name,
+                                              kpID);
       }
     } else {
       for (size_t i = 0; i < n; ++i) operator()(i);
@@ -3232,7 +3336,9 @@ class ViewMapping<
     using execution_space = typename alloc_prop::execution_space;
     using memory_space    = typename Traits::memory_space;
     using value_type      = typename Traits::value_type;
-    using functor_type    = ViewValueFunctor<execution_space, value_type>;
+    using functor_type =
+        ViewValueFunctor<Kokkos::Device<execution_space, memory_space>,
+                         value_type>;
     using record_type =
         Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>;
 
@@ -3314,17 +3420,10 @@ class ViewMapping<
                            Kokkos::LayoutStride>::value))))>::type> {
  private:
   enum {
-    is_assignable_space =
-#if 1
-        Kokkos::Impl::MemorySpaceAccess<
-            typename DstTraits::memory_space,
-            typename SrcTraits::memory_space>::assignable
-  };
-#else
-        std::is_same<typename DstTraits::memory_space,
-                     typename SrcTraits::memory_space>::value
+    is_assignable_space = Kokkos::Impl::MemorySpaceAccess<
+        typename DstTraits::memory_space,
+        typename SrcTraits::memory_space>::assignable
   };
-#endif
 
   enum {
     is_assignable_value_type =
@@ -3728,7 +3827,7 @@ class ViewMapping<
 
   template <class MemoryTraits>
   struct apply {
-    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
+    static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, "");
 
     using traits_type =
         Kokkos::ViewTraits<data_type, array_layout,
@@ -3842,24 +3941,21 @@ struct OperatorBoundsErrorOnDevice<MapType, true> {
    this defined by default.
    The existence of this alias indicates the existence of MapType::is_managed
  */
-template <class T, class Enable = void>
-struct has_printable_label_typedef : public std::false_type {};
-
 template <class T>
-struct has_printable_label_typedef<T,
-                                   void_t<typename T::printable_label_typedef>>
-    : public std::true_type {};
+using printable_label_typedef_t = typename T::printable_label_typedef;
 
-template <class MapType>
-KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const&,
-                                                            std::false_type) {
+template <class Map>
+KOKKOS_FUNCTION
+    std::enable_if_t<!is_detected<printable_label_typedef_t, Map>::value>
+    operator_bounds_error_on_device(Map const&) {
   Kokkos::abort("View bounds error");
 }
 
-template <class MapType>
-KOKKOS_INLINE_FUNCTION void operator_bounds_error_on_device(MapType const& map,
-                                                            std::true_type) {
-  OperatorBoundsErrorOnDevice<MapType>::run(map);
+template <class Map>
+KOKKOS_FUNCTION
+    std::enable_if_t<is_detected<printable_label_typedef_t, Map>::value>
+    operator_bounds_error_on_device(Map const& map) {
+  OperatorBoundsErrorOnDevice<Map>::run(map);
 }
 
 #endif  // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
@@ -3885,8 +3981,7 @@ KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds(
        This check should cover the case of Views that don't
        have the Unmanaged trait but were initialized by pointer. */
     if (tracker.m_tracker.has_record()) {
-      operator_bounds_error_on_device<MapType>(
-          map, has_printable_label_typedef<MapType>());
+      operator_bounds_error_on_device(map);
     } else {
       Kokkos::abort("View bounds error");
     }
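The final hunks in this file replace a hand-rolled has_printable_label_typedef trait plus tag dispatch with the detection idiom: an alias template names the member type being probed for, is_detected reports whether instantiating it succeeds, and enable_if selects the overload. A self-contained sketch of the same pattern with generic names (not the Kokkos map types):

#include <cstdio>
#include <type_traits>

// Minimal detector in the spirit of Kokkos_DetectionIdiom.hpp.
template <class T, template <class> class Op, class = void>
struct is_detected : std::false_type {};
template <class T, template <class> class Op>
struct is_detected<T, Op, std::void_t<Op<T>>> : std::true_type {};

// Archetype: the member type whose presence we are probing.
template <class T>
using label_t = typename T::printable_label_typedef;

struct WithLabel {
  using printable_label_typedef = int;
};
struct WithoutLabel {};

template <class Map>
std::enable_if_t<!is_detected<Map, label_t>::value> report(Map const&) {
  std::puts("no label available: generic error path");
}
template <class Map>
std::enable_if_t<is_detected<Map, label_t>::value> report(Map const&) {
  std::puts("label available: detailed error path");
}

int main() {
  report(WithLabel{});     // picks the detailed path
  report(WithoutLabel{});  // picks the generic path
}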
diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
index a5f5406746befc984f17f815e04bac63f0fadff4..d964baa8fb0f5e1b105d244740b74e32d1bdd69e 100644
--- a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
+++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp
@@ -62,10 +62,10 @@ void sink(Args&&... args) {
     Kokkos::ImplSYCL::sink(__VA_ARGS__);   \
   } while (0)
 #else
-#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...)                       \
-  do {                                                                   \
-    static const __attribute__((opencl_constant)) char fmt[] = (format); \
-    sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__);              \
+#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...)                \
+  do {                                                            \
+    const __attribute__((opencl_constant)) char fmt[] = (format); \
+    sycl::ONEAPI::experimental::printf(fmt, ##__VA_ARGS__);       \
   } while (0)
 #endif
 #endif
diff --git a/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
index 4467b2e03c486d07d80c3fee66e6c3b50c42256e..e12d1f6a49d37f4f595214be00713c2ccb4166ef 100644
--- a/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_ExecutionSpaceTrait.hpp
@@ -56,6 +56,11 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_execution_space_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_execution_space_erroneously_given_to_execution_policy<void> {
+};
 struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> {
   struct base_traits {
     static constexpr auto execution_space_is_defaulted = true;
@@ -63,32 +68,30 @@ struct ExecutionSpaceTrait : TraitSpecificationBase<ExecutionSpaceTrait> {
     using execution_space = Kokkos::DefaultExecutionSpace;
   };
   template <class T>
-  using trait_matches_specification = is_execution_space<T>;
-};
-
-// </editor-fold> end trait specification }}}1
-//==============================================================================
+  using trait_matches_specification = Kokkos::is_execution_space<T>;
+  template <class ExecSpace, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+    static constexpr auto show_execution_space_error_in_compilation_message =
+        show_extra_execution_space_erroneously_given_to_execution_policy<
+            std::conditional_t<base_t::execution_space_is_defaulted, void,
+                               typename base_t::execution_space>>{};
+    static_assert(base_t::execution_space_is_defaulted,
+                  "Kokkos Error: More than one execution space given. Search "
+                  "compiler output for 'show_extra_execution_space' to see the "
+                  "type of the errant tag.");
 
-template <class ExecutionSpace, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value>,
-    ExecutionSpace, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
+    static constexpr auto execution_space_is_defaulted = false;
 
-  static_assert(base_t::execution_space_is_defaulted,
-                "Kokkos Error: More than one execution space given");
-
-  static constexpr bool execution_space_is_defaulted = false;
-
-  using execution_space = ExecutionSpace;
+    using execution_space = ExecSpace;
+  };
 };
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end trait specification }}}1
 //==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
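The new show_extra_execution_space_erroneously_given_to_execution_policy class is a diagnostic device rather than functionality: only the void specialization is complete, so when the static_assert fires the compiler is also forced to print an instantiation whose template argument is the offending type. A generic sketch of the trick with hypothetical names:

#include <type_traits>

// Only the "no error" case is complete; any other argument makes the member
// initialization below ill-formed, and the diagnostic spells out the type.
template <class T>
struct show_offending_type;  // intentionally left undefined
template <>
struct show_offending_type<void> {};

template <bool AlreadySet, class Offender>
struct checker {
  static constexpr auto show_error_in_compilation_message =
      show_offending_type<std::conditional_t<!AlreadySet, void, Offender>>{};
  static_assert(!AlreadySet,
                "more than one value given; search the compiler output for "
                "'show_offending_type' to see which one");
};

int main() {
  checker<false, int> ok;  // compiles: the offender is never inspected
  (void)ok;
  // checker<true, double> broken;  // fails, naming show_offending_type<double>
}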
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
index eb649dc0887a2aab8c88feae8156676b70a7cdf7..b57dfbbc07ccc0e2391b2fdb5b6ec577ed552cc2 100644
--- a/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_GraphKernelTrait.hpp
@@ -61,6 +61,12 @@ struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> {
   struct base_traits {
     using is_graph_kernel = std::false_type;
   };
+  template <class, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using is_graph_kernel = std::true_type;
+  };
   template <class T>
   using trait_matches_specification = std::is_same<T, IsGraphKernelTag>;
 };
@@ -68,19 +74,6 @@ struct GraphKernelTrait : TraitSpecificationBase<GraphKernelTrait> {
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Impl::IsGraphKernelTag, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  using is_graph_kernel = std::true_type;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
 }  // end namespace Impl
 }  // end namespace Kokkos
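mixin_matching_trait is the new shape shared by all of these trait headers: instead of a dedicated AnalyzeExecPolicy specialization per trait, each specification contributes a mixin that the analysis machinery chains by inheritance, and the mixin overrides exactly the members it owns while everything else falls through to base_traits. A stripped-down sketch of that chaining with toy names:

#include <type_traits>

// Defaults that sit at the bottom of the chain.
struct base_traits {
  using is_graph_kernel = std::false_type;
  static constexpr bool schedule_type_is_defaulted = true;
};

// One mixin per trait the user actually supplied; it shadows its own member.
template <class Base>
struct graph_kernel_mixin : Base {
  using is_graph_kernel = std::true_type;  // hides Base::is_graph_kernel
};

// "Analysis result" for a policy that carried the graph-kernel tag:
using analyzed = graph_kernel_mixin<base_traits>;

static_assert(analyzed::is_graph_kernel::value,
              "the mixin overrides the defaulted member");
static_assert(analyzed::schedule_type_is_defaulted,
              "members owned by other traits keep their defaults");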
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
index e15adc17116cb66481f90acc0b9ba5a83ec1ab52..63446375fbd529e82d71acf2bac5ef12fba238af 100644
--- a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp
@@ -46,54 +46,71 @@
 #define KOKKOS_KOKKOS_INDEXTYPETRAIT_HPP
 
 #include <Kokkos_Macros.hpp>
-#include <Kokkos_Concepts.hpp>  // IndexType, is_index_type
+#include <Kokkos_Concepts.hpp>  // IndexType
 #include <traits/Kokkos_PolicyTraitAdaptor.hpp>
 #include <traits/Kokkos_Traits_fwd.hpp>
 
 namespace Kokkos {
 namespace Impl {
 
+template <class Trait, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin;
+
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_index_type_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_index_type_erroneously_given_to_execution_policy<void> {};
 struct IndexTypeTrait : TraitSpecificationBase<IndexTypeTrait> {
   struct base_traits {
     static constexpr bool index_type_is_defaulted = true;
     using index_type = dependent_policy_trait_default;
   };
-  template <class T>
-  using trait_matches_specification =
-      std::integral_constant<bool, std::is_integral<T>::value ||
-                                       is_index_type<T>::value>;
+  template <class IdxType, class AnalyzeNextTrait>
+  using mixin_matching_trait = IndexTypePolicyMixin<IdxType, AnalyzeNextTrait>;
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="IndexTypePolicyMixin specializations"> {{{1
 
 // Index type given as IndexType template
-template <class IntegralIndexType, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::IndexType<IntegralIndexType>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+template <class IntegralIndexType, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin<Kokkos::IndexType<IntegralIndexType>,
+                            AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
+  static constexpr auto show_index_type_error_in_compilation_message =
+      show_extra_index_type_erroneously_given_to_execution_policy<
+          std::conditional_t<base_t::index_type_is_defaulted, void,
+                             typename base_t::index_type>>{};
   static_assert(base_t::index_type_is_defaulted,
-                "Kokkos Error: More than one index type given");
+                "Kokkos Error: More than one index type given. Search "
+                "compiler output for 'show_extra_index_type' to see the "
+                "type of the errant tag.");
   static constexpr bool index_type_is_defaulted = false;
   using index_type = Kokkos::IndexType<IntegralIndexType>;
 };
 
-// IndexType given as an integral type directly
-template <class IntegralIndexType, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<std::is_integral<IntegralIndexType>::value>,
-    IntegralIndexType, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+// IndexType given as an integral type directly (the matcher already checks
+// this, so we don't have to specialize to re-check it here)
+template <class IntegralIndexType, class AnalyzeNextTrait>
+struct IndexTypePolicyMixin : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
+  static constexpr auto show_index_type_error_in_compilation_message =
+      show_extra_index_type_erroneously_given_to_execution_policy<
+          std::conditional_t<base_t::index_type_is_defaulted, void,
+                             typename base_t::index_type>>{};
   static_assert(base_t::index_type_is_defaulted,
-                "Kokkos Error: More than one index type given");
+                "Kokkos Error: More than one index type given. Search "
+                "compiler output for 'show_extra_index_type' to see the "
+                "type of the errant tag.");
+  static_assert(std::is_integral<IntegralIndexType>::value, "");
   static constexpr bool index_type_is_defaulted = false;
   using index_type = Kokkos::IndexType<IntegralIndexType>;
 };
@@ -101,6 +118,22 @@ struct AnalyzeExecPolicy<
 // </editor-fold> end AnalyzeExecPolicy specializations }}}1
 //==============================================================================
 
+//==============================================================================
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
+
+template <class IntegralIndexType>
+struct PolicyTraitMatcher<IndexTypeTrait, IndexType<IntegralIndexType>>
+    : std::true_type {};
+
+template <class IntegralIndexType>
+struct PolicyTraitMatcher<
+    IndexTypeTrait, IntegralIndexType,
+    std::enable_if_t<std::is_integral<IntegralIndexType>::value>>
+    : std::true_type {};
+
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
+//==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
index 30e07039a405d61f2c78217284f9036a0a533f06..b05f3b29e976c503c120c4f59dc4ed81b01822f7 100644
--- a/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_IterationPatternTrait.hpp
@@ -45,8 +45,11 @@
 #ifndef KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
 #define KOKKOS_KOKKOS_ITERATIONPATTERNTRAIT_HPP
 
-#include <Kokkos_Concepts.hpp>  // is_iteration_pattern
-#include <type_traits>          // is_void
+#include <Kokkos_Concepts.hpp>                   // is_iteration_pattern
+#include <traits/Kokkos_PolicyTraitAdaptor.hpp>  // TraitSpecificationBase
+#include <Kokkos_Rank.hpp>                       // Rank
+#include <Kokkos_Layout.hpp>                     // Iterate
+#include <type_traits>                           // is_void
 
 namespace Kokkos {
 namespace Impl {
@@ -54,32 +57,42 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_iteration_pattern_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_iteration_pattern_erroneously_given_to_execution_policy<
+    void> {};
 struct IterationPatternTrait : TraitSpecificationBase<IterationPatternTrait> {
   struct base_traits {
     using iteration_pattern = void;  // TODO set default iteration pattern
   };
-  template <class T>
-  using trait_matches_specification = is_iteration_pattern<T>;
+  template <class IterPattern, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    static constexpr auto show_iteration_pattern_error_in_compilation_message =
+        show_extra_iteration_pattern_erroneously_given_to_execution_policy<
+            typename base_t::iteration_pattern>{};
+    static_assert(
+        std::is_void<typename base_t::iteration_pattern>::value,
+        "Kokkos Error: More than one iteration pattern given. Search "
+        "compiler output for 'show_extra_iteration_pattern' to see the "
+        "type of the errant tag.");
+    using iteration_pattern = IterPattern;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
 
-template <class IterationPattern, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<is_iteration_pattern<IterationPattern>::value>,
-    IterationPattern, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(std::is_void<typename base_t::iteration_pattern>::value,
-                "Kokkos Error: More than one iteration pattern given");
-  using iteration_pattern = IterationPattern;
-};
+template <unsigned N, Iterate OuterDir, Iterate InnerDir>
+struct PolicyTraitMatcher<IterationPatternTrait, Rank<N, OuterDir, InnerDir>>
+    : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
 
 }  // end namespace Impl
diff --git a/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
index 73ae8e27e2eca54412b4cbab464b1760c93d7aed..06836bef8bff6ffc19d470766e0caa0a739a43c2 100644
--- a/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_LaunchBoundsTrait.hpp
@@ -62,29 +62,33 @@ struct LaunchBoundsTrait : TraitSpecificationBase<LaunchBoundsTrait> {
 
     using launch_bounds = LaunchBounds<>;
   };
-  template <class T>
-  using trait_matches_specification = is_launch_bounds<T>;
+  template <class LaunchBoundParam, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+
+    static constexpr bool launch_bounds_is_defaulted = false;
+
+    static_assert(base_t::launch_bounds_is_defaulted,
+                  "Kokkos Error: More than one launch_bounds given");
+
+    using launch_bounds = LaunchBoundParam;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
 
-template <unsigned int MaxT, unsigned int MinB, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::LaunchBounds<MaxT, MinB>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(base_t::launch_bounds_is_defaulted,
-                "Kokkos Error: More than one launch_bounds given");
-  static constexpr bool launch_bounds_is_defaulted = false;
-  using launch_bounds = Kokkos::LaunchBounds<MaxT, MinB>;
-};
+template <unsigned int maxT, unsigned int minB>
+struct PolicyTraitMatcher<LaunchBoundsTrait, LaunchBounds<maxT, minB>>
+    : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
index 3deb4a94d54ddeee0a6a0712f107d61674818668..73be14cf8501b3c3bff4a2386e2f75e1ffb00f19 100644
--- a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp
@@ -82,6 +82,9 @@ struct MaximizeOccupancy {
 
 namespace Impl {
 
+template <class Policy, class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin;
+
 //==============================================================================
 // <editor-fold desc="Occupancy control trait specification"> {{{1
 
@@ -94,6 +97,9 @@ struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> {
       return occupancy_control{};
     }
   };
+  template <class OccControl, class AnalyzeNextTrait>
+  using mixin_matching_trait =
+      OccupancyControlPolicyMixin<OccControl, AnalyzeNextTrait>;
   template <class T>
   using trait_matches_specification = std::integral_constant<
       bool,
@@ -105,39 +111,33 @@ struct OccupancyControlTrait : TraitSpecificationBase<OccupancyControlTrait> {
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-// The DesiredOccupancy case has runtime storage, so we need to handle copies
-// and assignments
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy,
-                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
- public:
-  using base_t            = AnalyzeExecPolicy<void, Traits...>;
+// <editor-fold desc="OccupancyControlPolicyMixin specializations"> {{{1
+
+template <class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin<Kokkos::Experimental::DesiredOccupancy,
+                                   AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t            = AnalyzeNextTrait;
   using occupancy_control = Kokkos::Experimental::DesiredOccupancy;
   static constexpr bool experimental_contains_desired_occupancy = true;
 
-  template <class OccControl>
-  using with_occupancy_control = AnalyzeExecPolicy<void, OccControl, Traits...>;
-
   // Treat this as private, but make it public so that MSVC will still treat
   // this as a standard layout class and make it the right size: storage for a
   // stateful desired occupancy
   //   private:
-  occupancy_control m_desired_occupancy;
+  occupancy_control m_desired_occupancy = occupancy_control{};
 
-  AnalyzeExecPolicy() = default;
+  OccupancyControlPolicyMixin() = default;
   // Converting constructor
   // Just rely on the convertibility of occupancy_control to transfer the data
   template <class Other>
-  AnalyzeExecPolicy(ExecPolicyTraitsWithDefaults<Other> const& other)
+  OccupancyControlPolicyMixin(ExecPolicyTraitsWithDefaults<Other> const& other)
       : base_t(other),
         m_desired_occupancy(other.impl_get_occupancy_control()) {}
 
   // Converting assignment operator
   // Just rely on the convertibility of occupancy_control to transfer the data
   template <class Other>
-  AnalyzeExecPolicy& operator=(
+  OccupancyControlPolicyMixin& operator=(
       ExecPolicyTraitsWithDefaults<Other> const& other) {
     *static_cast<base_t*>(this) = other;
     this->impl_set_desired_occupancy(
@@ -160,16 +160,16 @@ struct AnalyzeExecPolicy<void, Kokkos::Experimental::DesiredOccupancy,
   }
 };
 
-template <class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Experimental::MaximizeOccupancy,
-                         Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
+template <class AnalyzeNextTrait>
+struct OccupancyControlPolicyMixin<Kokkos::Experimental::MaximizeOccupancy,
+                                   AnalyzeNextTrait> : AnalyzeNextTrait {
+  using base_t = AnalyzeNextTrait;
   using base_t::base_t;
   using occupancy_control = Kokkos::Experimental::MaximizeOccupancy;
   static constexpr bool experimental_contains_desired_occupancy = false;
 };
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end OccupancyControlPolicyMixin specializations }}}1
 //==============================================================================
 
 }  // end namespace Impl
diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
index b087dac85559bd6dc67c983bdaad1a6675cfde9b..e500dd4e831abaa03479e9fb1e2fb67595107a9e 100644
--- a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp
@@ -73,7 +73,7 @@ namespace Impl {
 // something that we can default to in the unspecialized case, just like we
 // do for AnalyzeExecPolicy
 template <class TraitSpec, class Trait, class Enable = void>
-struct PolicyTraitMatcher;
+struct PolicyTraitMatcher : std::false_type {};
 
 template <class TraitSpec, class Trait>
 struct PolicyTraitMatcher<
diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..31927320bf6fe77cc133e49a80e7f741574165a8
--- /dev/null
+++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitMatcher.hpp
@@ -0,0 +1,77 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_Utilities.hpp>  // type_list
+
+#include <traits/Kokkos_Traits_fwd.hpp>
+
+#ifndef KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
+#define KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+//==============================================================================
+// <editor-fold desc="PolicyTraitMatcher"> {{{1
+
+// To handle the WorkTag case, we need more than just a predicate; we need
+// something that we can default to in the unspecialized case, just like we
+// do for AnalyzeExecPolicy
+template <class TraitSpec, class Trait, class Enable = void>
+struct PolicyTraitMatcher : std::false_type {};
+
+template <class TraitSpec, class Trait>
+struct PolicyTraitMatcher<
+    TraitSpec, Trait,
+    std::enable_if_t<
+        TraitSpec::template trait_matches_specification<Trait>::value>>
+    : std::true_type {};
+
+// </editor-fold> end PolicyTraitMatcher }}}1
+//==============================================================================
+
+}  // end namespace Impl
+}  // end namespace Kokkos
+
+#endif  // KOKKOS_KOKKOS_POLICYTRAITMATCHER_HPP
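
The matcher above follows a standard compile-time-predicate pattern: the primary template defaults to std::false_type, and a SFINAE-constrained partial specialization flips to std::true_type only when the spec's trait_matches_specification accepts the candidate, so unmatched combinations quietly yield false instead of a hard error. A minimal standalone sketch of the same pattern, using hypothetical names rather than the Kokkos internals:

#include <type_traits>

// A toy "trait specification": it declares which candidate types it accepts.
struct ToySpec {
  template <class T>
  using trait_matches_specification = std::is_integral<T>;
};

// Unspecialized case defaults to false, exactly like PolicyTraitMatcher above.
template <class Spec, class Trait, class Enable = void>
struct ToyMatcher : std::false_type {};

// Enabled only when the spec says the trait matches.
template <class Spec, class Trait>
struct ToyMatcher<
    Spec, Trait,
    std::enable_if_t<Spec::template trait_matches_specification<Trait>::value>>
    : std::true_type {};

static_assert(ToyMatcher<ToySpec, int>::value, "int matches");
static_assert(!ToyMatcher<ToySpec, double>::value, "double falls to the default");
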
diff --git a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
index 74bab6fce2a632269a804971af3e50348e34c8b2..3e578f9060ab22b8707adc8797d401226a52ff44 100644
--- a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp
@@ -57,34 +57,43 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_schedule_type_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_schedule_type_erroneously_given_to_execution_policy<void> {};
 struct ScheduleTrait : TraitSpecificationBase<ScheduleTrait> {
   struct base_traits {
     static constexpr auto schedule_type_is_defaulted = true;
 
     using schedule_type = Schedule<Static>;
   };
-  template <class T>
-  using trait_matches_specification = is_schedule_type<T>;
+  template <class Sched, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using schedule_type = Sched;
+    static constexpr auto show_schedule_type_error_in_compilation_message =
+        show_extra_schedule_type_erroneously_given_to_execution_policy<
+            std::conditional_t<base_t::schedule_type_is_defaulted, void,
+                               typename base_t::schedule_type>>{};
+    static_assert(base_t::schedule_type_is_defaulted,
+                  "Kokkos Error: More than one schedule type given. Search "
+                  "compiler output for 'show_extra_schedule_type' to see the "
+                  "type of the errant tag.");
+    static constexpr bool schedule_type_is_defaulted = false;
+  };
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
 //==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class ScheduleType, class... Traits>
-struct AnalyzeExecPolicy<void, Kokkos::Schedule<ScheduleType>, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(base_t::schedule_type_is_defaulted,
-                "Kokkos Error: More than one schedule type given");
-  static constexpr bool schedule_type_is_defaulted = false;
-  using schedule_type = Kokkos::Schedule<ScheduleType>;
-};
+// <editor-fold desc="PolicyTraitMatcher specialization"> {{{1
+
+template <class Sched>
+struct PolicyTraitMatcher<ScheduleTrait, Schedule<Sched>> : std::true_type {};
 
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
+// </editor-fold> end PolicyTraitMatcher specialization }}}1
 //==============================================================================
 
 }  // end namespace Impl
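
The show_extra_schedule_type_erroneously_given_to_execution_policy member exists only to force the duplicated type into the compiler's diagnostic: instantiating the incomplete primary template fails, and the error text names the offending type, while the accompanying static_assert supplies the human-readable message. A stripped-down sketch of the same trick, with hypothetical names and standard C++ only:

#include <type_traits>

// Incomplete primary template: instantiating it is an error whose text prints T.
template <class T>
struct show_duplicate_setting;
// Complete only for void, i.e. for the "no duplicate was given" case.
template <>
struct show_duplicate_setting<void> {};

template <class NewSetting, class PreviousSetting = void>
struct apply_setting {
  // When PreviousSetting is not void, this member forces an error that names
  // PreviousSetting, so the mistake is easy to spot in the compiler output.
  static constexpr auto show_error_in_compilation_message =
      show_duplicate_setting<PreviousSetting>{};
  static_assert(std::is_void<PreviousSetting>::value,
                "setting given more than once; search the compiler output for "
                "'show_duplicate_setting' to see the earlier value");
  using type = NewSetting;
};

using ok = apply_setting<int>::type;            // fine
// using bad = apply_setting<int, long>::type;  // error message mentions 'long'
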
diff --git a/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
index b8b9a0ca2d889b08116528803d0c1b096060ecad..b8289ca6188846884277ca514db453145f1cb3c6 100644
--- a/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_Traits_fwd.hpp
@@ -51,9 +51,15 @@ namespace Impl {
 template <class Enable, class... TraitsList>
 struct AnalyzeExecPolicy;
 
+template <class Enable, class TraitSpecList, class... Traits>
+struct AnalyzeExecPolicyUseMatcher;
+
 template <class AnalysisResults>
 struct ExecPolicyTraitsWithDefaults;
 
+template <class TraitSpec, class Trait, class Enable>
+struct PolicyTraitMatcher;
+
 template <class TraitSpec, template <class...> class PolicyTemplate,
           class AlreadyProcessedList, class ToProcessList, class NewTrait,
           class Enable = void>
@@ -67,6 +73,40 @@ struct PolicyTraitAdaptor;
 // traits
 struct dependent_policy_trait_default;
 
+//==============================================================================
+// <editor-fold desc="Execution policy trait specifications"> {{{1
+
+struct ExecutionSpaceTrait;
+struct IndexTypeTrait;
+struct ScheduleTrait;
+struct IterationPatternTrait;
+struct WorkItemPropertyTrait;
+struct LaunchBoundsTrait;
+struct OccupancyControlTrait;
+struct GraphKernelTrait;
+struct WorkTagTrait;
+
+// Keep these sorted by frequency of use to reduce compilation time
+//
+// clang-format off
+using execution_policy_trait_specifications =
+  type_list<
+    ExecutionSpaceTrait,
+    IndexTypeTrait,
+    ScheduleTrait,
+    IterationPatternTrait,
+    WorkItemPropertyTrait,
+    LaunchBoundsTrait,
+    OccupancyControlTrait,
+    GraphKernelTrait,
+    // This one has to be last, unfortunately:
+    WorkTagTrait
+  >;
+// clang-format on
+
+// </editor-fold> end Execution policy trait specifications }}}1
+//==============================================================================
+
 }  // end namespace Impl
 }  // end namespace Kokkos
 
diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
index 2656316fb934333655d0370f4dc3d40eea7bbb86..35671d19b02bb72c777b77717beced94d152beb3 100644
--- a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp
@@ -60,6 +60,12 @@ struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> {
   struct base_traits {
     using work_item_property = Kokkos::Experimental::WorkItemProperty::None_t;
   };
+  template <class WorkItemProp, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using work_item_property = WorkItemProp;
+  };
   template <class T>
   using trait_matches_specification =
       Kokkos::Experimental::is_work_item_property<T>;
@@ -68,26 +74,6 @@ struct WorkItemPropertyTrait : TraitSpecificationBase<WorkItemPropertyTrait> {
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-template <class Property, class... Traits>
-struct AnalyzeExecPolicy<
-    std::enable_if_t<
-        Kokkos::Experimental::is_work_item_property<Property>::value>,
-    Property, Traits...> : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(
-      std::is_same<typename base_t::work_item_property,
-                   Kokkos::Experimental::WorkItemProperty::None_t>::value,
-      "Kokkos Error: More than one work item property given");
-  using work_item_property = Property;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
-
 }  // end namespace Impl
 
 namespace Experimental {
diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
index 877005756a703b067c07c6f57c3fc4212f7484ca..424e5c405b70cff9f73ef5756b5dca41e9d3d618 100644
--- a/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
+++ b/packages/kokkos/core/src/traits/Kokkos_WorkTagTrait.hpp
@@ -49,6 +49,7 @@
 #include <Kokkos_Concepts.hpp>  // is_execution_space
 #include <traits/Kokkos_PolicyTraitAdaptor.hpp>
 #include <traits/Kokkos_Traits_fwd.hpp>
+#include <impl/Kokkos_Utilities.hpp>  // type_list_any, type_list_remove_first
 
 namespace Kokkos {
 namespace Impl {
@@ -56,68 +57,65 @@ namespace Impl {
 //==============================================================================
 // <editor-fold desc="trait specification"> {{{1
 
+template <class T>
+struct show_extra_work_tag_erroneously_given_to_execution_policy;
+template <>
+struct show_extra_work_tag_erroneously_given_to_execution_policy<void> {};
+
+using _exec_policy_traits_without_work_tag = typename type_list_remove_first<
+    WorkTagTrait, execution_policy_trait_specifications>::type;
+
+template <class Trait>
+struct _trait_matches_spec_predicate {
+  template <class TraitSpec>
+  struct apply {
+    using type = typename PolicyTraitMatcher<TraitSpec, Trait>::type;
+    static constexpr bool value = type::value;
+  };
+};
+
 struct WorkTagTrait : TraitSpecificationBase<WorkTagTrait> {
   struct base_traits {
     using work_tag = void;
   };
+  template <class WorkTag, class AnalyzeNextTrait>
+  struct mixin_matching_trait : AnalyzeNextTrait {
+    using base_t = AnalyzeNextTrait;
+    using base_t::base_t;
+    using work_tag = WorkTag;
+    static constexpr auto show_work_tag_error_in_compilation_message =
+        show_extra_work_tag_erroneously_given_to_execution_policy<
+            typename base_t::work_tag>{};
+    static_assert(
+        std::is_void<typename base_t::work_tag>::value,
+        "Kokkos Error: More than one work tag given. Search compiler output "
+        "for 'show_extra_work_tag' to see the type of the errant tag.");
+  };
+  // Since we don't have subsumption in pre-C++20, we need to have the work tag
+  // "trait" handling code ensure that none of the other conditions are met.
+  // * Compile time cost complexity note: at first glance it looks like this
+  //   "rechecks" all of the other trait specs when used in the context of the
+  //   full list of execution policy traits, but actually since we've already
+  //   checked all of them to get to the end of the list, the compiler will
+  //   have already generated those definitions, so there should be little extra
+  //   cost to this. However, in the scenario where we use work tag in isolation
+  //   (like if we were to add a `require()`-like thing that changes the work
+  //   tag of an existing execution policy instance), we need to check all of
+  //   the other traits to make sure that we're not replacing something else,
+  //   given that the concept of a work tag is basically unconstrained and could
+  //   be anything.  This should still be as efficient at compile time as the
+  //   old code that just did a big long series of nested std::conditionals, but
+  //   we should benchmark this assumption if it becomes a problem.
+  template <class T>
+  using trait_matches_specification = std::integral_constant<
+      bool, !std::is_void<T>::value &&
+                !type_list_any<_trait_matches_spec_predicate<T>::template apply,
+                               _exec_policy_traits_without_work_tag>::value>;
 };
 
 // </editor-fold> end trait specification }}}1
 //==============================================================================
 
-//==============================================================================
-// <editor-fold desc="AnalyzeExecPolicy specializations"> {{{1
-
-// Since we don't have subsumption in pre-C++20, we need to have the work tag
-// "trait" handling code be unspecialized, so we handle it instead in a class
-// with a different name.
-template <class... Traits>
-struct AnalyzeExecPolicyHandleWorkTag : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-};
-
-template <class WorkTag, class... Traits>
-struct AnalyzeExecPolicyHandleWorkTag<WorkTag, Traits...>
-    : AnalyzeExecPolicy<void, Traits...> {
-  using base_t = AnalyzeExecPolicy<void, Traits...>;
-  using base_t::base_t;
-  static_assert(std::is_void<typename base_t::work_tag>::value,
-                "Kokkos Error: More than one work tag given");
-  using work_tag = WorkTag;
-};
-
-// This only works if this is not a partial specialization, so we have to
-// do the partial specialization elsewhere
-template <class Enable, class... Traits>
-struct AnalyzeExecPolicy : AnalyzeExecPolicyHandleWorkTag<Traits...> {
-  using base_t = AnalyzeExecPolicyHandleWorkTag<Traits...>;
-  using base_t::base_t;
-};
-
-// </editor-fold> end AnalyzeExecPolicy specializations }}}1
-//==============================================================================
-
-//==============================================================================
-// <editor-fold desc="PolicyTraitMatcher specializations"> {{{1
-
-// In order to match the work tag trait the work tag "matcher" needs to be
-// unspecialized and the logic needs to be handled in a differently-named class,
-// just like above.
-template <class TraitSpec, class Trait>
-struct PolicyTraitMatcherHandleWorkTag : std::false_type {};
-
-template <class Trait>
-struct PolicyTraitMatcherHandleWorkTag<WorkTagTrait, Trait>
-    : std::integral_constant<bool, !std::is_void<Trait>::value> {};
-
-template <class TraitSpec, class Trait, class Enable>
-struct PolicyTraitMatcher /* unspecialized! */
-    : PolicyTraitMatcherHandleWorkTag<TraitSpec, Trait> {};
-
-// </editor-fold> end PolicyTraitMatcher specializations }}}1
-//==============================================================================
-
 }  // end namespace Impl
 }  // end namespace Kokkos
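
Because pre-C++20 templates have no subsumption, the work-tag spec can only be defined negatively: a candidate is treated as a work tag if it is non-void and no other trait specification claims it. The shape of that check, reduced to plain C++14 with hypothetical stand-ins for the Kokkos type-list utilities:

#include <type_traits>

// Two stand-ins for "the other trait specifications", each with a membership test.
struct ScheduleLikeSpec {
  template <class T>
  using trait_matches_specification = std::is_integral<T>;
};
struct IndexTypeLikeSpec {
  template <class T>
  using trait_matches_specification = std::is_floating_point<T>;
};

// Does any spec in the pack claim the candidate T?
template <class T, class... Specs>
struct claimed_by_any : std::false_type {};
template <class T, class Spec, class... Rest>
struct claimed_by_any<T, Spec, Rest...>
    : std::conditional_t<Spec::template trait_matches_specification<T>::value,
                         std::true_type, claimed_by_any<T, Rest...>> {};

// A candidate looks like a work tag only if it is non-void and unclaimed.
template <class T>
using looks_like_work_tag = std::integral_constant<
    bool, !std::is_void<T>::value &&
              !claimed_by_any<T, ScheduleLikeSpec, IndexTypeLikeSpec>::value>;

struct MyTag {};
static_assert(looks_like_work_tag<MyTag>::value, "unclaimed class type is a tag");
static_assert(!looks_like_work_tag<int>::value, "claimed by the schedule-like spec");
static_assert(!looks_like_work_tag<void>::value, "void is never a tag");
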
 
diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt b/packages/kokkos/core/unit_test/CMakeLists.txt
index 5826208851090933ee296988287a6a633eb2c476..89b8ff1e4f0a8004ecd4c2f06d72544123107d03 100644
--- a/packages/kokkos/core/unit_test/CMakeLists.txt
+++ b/packages/kokkos/core/unit_test/CMakeLists.txt
@@ -41,10 +41,10 @@ SET(KOKKOS_OPENMP_FEATURE_LEVEL 999)
 SET(KOKKOS_OPENMP_NAME OpenMP)
 
 # FIXME_OPENMPTARGET - The NVIDIA HPC compiler nvc++ only compiles the first 8 incremental tests for the OpenMPTarget backend.
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
-  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 8)
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 10)
 ELSE()
-  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 13)
+  SET(KOKKOS_OPENMPTARGET_FEATURE_LEVEL 14)
 ENDIF()
 
 SET(KOKKOS_OPENMPTARGET_NAME Experimental::OpenMPTarget)
@@ -65,6 +65,21 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files)
 
+SET(COMPILE_ONLY_SOURCES
+  TestDetectionIdiom.cpp
+  TestInterOp.cpp
+  TestTypeList.cpp
+)
+# TestInterOp has a dependency on containers
+IF(KOKKOS_HAS_TRILINOS)
+  LIST(REMOVE_ITEM COMPILE_ONLY_SOURCES TestInterOp.cpp)
+ENDIF()
+KOKKOS_ADD_EXECUTABLE(
+  TestCompileOnly
+  SOURCES
+  ${COMPILE_ONLY_SOURCES}
+)
+
 foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   # Because there is always an exception to the rule
   if(Tag STREQUAL "Threads")
@@ -98,6 +113,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         Complex
         Crs
         DeepCopyAlignment
+        ExecutionSpace
         FunctorAnalysis
         Init
         LocalDeepCopy
@@ -107,6 +123,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
         MDRange_c
         HostSharedPtr
         HostSharedPtrAccessOnDevice
+        QuadPrecisionMath
+        ExecSpacePartitioning
+        MathematicalSpecialFunctions
         )
       set(file ${dir}/Test${Tag}_${Name}.cpp)
       # Write to a temporary intermediate file and call configure_file to avoid
@@ -190,7 +209,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
     elseif(Tag STREQUAL "HIP")
       set(TagHostAccessible HIPHostPinned)
     elseif(Tag STREQUAL "SYCL")
-      set(TagHostAccessible SYCLSharedUSMSpace)
+      set(TagHostAccessible SYCLSharedUSM)
     endif()
 
     set(${Tag}_SOURCES2B)
@@ -257,6 +276,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;HIP;SYCL)
   endif()
 endforeach()
 
+foreach(PairDeviceSpace HIP-HostPinned;Cuda-HostPinned;Cuda-UVM;SYCL-HostUSM;SYCL-SharedUSM)
+  string(REGEX REPLACE "([^-]*)-(.*)" "\\1" DEVICE ${PairDeviceSpace})
+  string(REGEX REPLACE "([^-]*)-(.*)" "\\2" SPACE ${PairDeviceSpace})
+
+  string(TOUPPER ${DEVICE} UPPER_DEVICE)
+  string(TOLOWER ${DEVICE} dir)
+
+  if(Kokkos_ENABLE_${UPPER_DEVICE})
+    set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir})
+    file(MAKE_DIRECTORY ${dir})
+    foreach(Name
+      SharedAlloc
+      ViewAPI_a
+      ViewAPI_b
+      ViewAPI_c
+      ViewAPI_d
+      ViewAPI_e
+      ViewCopy_a
+      ViewCopy_b
+      ViewMapping_a
+      ViewMapping_b
+      ViewMapping_subview
+      )
+      set(file ${dir}/Test${DEVICE}${SPACE}_${Name}.cpp)
+      # Write to a temporary intermediate file and call configure_file to avoid
+      # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs.
+      file(WRITE ${dir}/dummy.cpp
+          "#include <Test${DEVICE}${SPACE}_Category.hpp>\n"
+          "#include <Test${Name}.hpp>\n"
+      )
+      configure_file(${dir}/dummy.cpp ${file})
+      list(APPEND ${DEVICE}_SOURCES3 ${file})
+    endforeach()
+    list(APPEND ${DEVICE}_SOURCES ${${DEVICE}_SOURCES3})
+  endif()
+endforeach()
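
Each source file generated by this loop is nothing more than a pair of includes, combining a backend/memory-space "category" header with a reusable, space-agnostic test header. For example, the Cuda-UVM ViewAPI_a combination produces a file equivalent to the following (illustrative; the exact names come from the file(WRITE ...) call above, and the category header is where the test harness expects TEST_EXECSPACE/TEST_CATEGORY to be defined):

// TestCudaUVM_ViewAPI_a.cpp -- generated by the file(WRITE)/configure_file pair above
#include <TestCudaUVM_Category.hpp>  // selects the execution/memory space under test
#include <TestViewAPI_a.hpp>         // the space-agnostic test body
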
+
 if(Kokkos_ENABLE_OPENMPTARGET)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_complexfloat.cpp
@@ -264,9 +320,7 @@ if(Kokkos_ENABLE_OPENMPTARGET)
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Crs.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Other.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewCopy_a.cpp
@@ -278,9 +332,16 @@ if(Kokkos_ENABLE_OPENMPTARGET)
 endif()
 
 # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
-IF(KOKKOS_ENABLE_OPENMPTARGET
-   AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   list(REMOVE_ITEM OpenMPTarget_SOURCES
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_int64_t_reduce_dynamic_view.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_double_reduce_dynamic.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamTeamSize.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reductions_DeviceView.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_UniqueToken.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtr.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_HostSharedPtrAccessOnDevice.cpp
@@ -370,14 +431,19 @@ if(Kokkos_ENABLE_PTHREAD)
   )
 endif()
 
-if(Kokkos_ENABLE_OPENMP)
+if (Kokkos_ENABLE_OPENMP)
+  set(OpenMP_EXTRA_SOURCES
+    openmp/TestOpenMP_Task.cpp
+  )
+  if (Kokkos_ENABLE_DEPRECATED_CODE_3)

+    list(APPEND OpenMP_EXTRA_SOURCES openmp/TestOpenMP_PartitionMaster.cpp)
+  endif ()
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_OpenMP
     SOURCES
     UnitTestMainInit.cpp
     ${OpenMP_SOURCES}
-    openmp/TestOpenMP_PartitionMaster.cpp
-    openmp/TestOpenMP_Task.cpp
+    ${OpenMP_EXTRA_SOURCES}
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_OpenMPInterOp
@@ -463,28 +529,7 @@ if(Kokkos_ENABLE_CUDA)
       UnitTestMainInit.cpp
       cuda/TestCuda_Task.cpp
       cuda/TestCuda_TeamScratchStreams.cpp
-      cuda/TestCudaHostPinned_SharedAlloc.cpp
-      cuda/TestCudaHostPinned_ViewAPI_a.cpp
-      cuda/TestCudaHostPinned_ViewAPI_b.cpp
-      cuda/TestCudaHostPinned_ViewAPI_c.cpp
-      cuda/TestCudaHostPinned_ViewAPI_d.cpp
-      cuda/TestCudaHostPinned_ViewAPI_e.cpp
-      cuda/TestCudaHostPinned_ViewCopy_a.cpp
-      cuda/TestCudaHostPinned_ViewCopy_b.cpp
-      cuda/TestCudaHostPinned_ViewMapping_a.cpp
-      cuda/TestCudaHostPinned_ViewMapping_b.cpp
-      cuda/TestCudaHostPinned_ViewMapping_subview.cpp
-      cuda/TestCudaUVM_SharedAlloc.cpp
-      cuda/TestCudaUVM_ViewAPI_a.cpp
-      cuda/TestCudaUVM_ViewAPI_b.cpp
-      cuda/TestCudaUVM_ViewAPI_c.cpp
-      cuda/TestCudaUVM_ViewAPI_d.cpp
-      cuda/TestCudaUVM_ViewAPI_e.cpp
-      cuda/TestCudaUVM_ViewCopy_a.cpp
-      cuda/TestCudaUVM_ViewCopy_b.cpp
-      cuda/TestCudaUVM_ViewMapping_a.cpp
-      cuda/TestCudaUVM_ViewMapping_b.cpp
-      cuda/TestCudaUVM_ViewMapping_subview.cpp
+      ${Cuda_SOURCES3}
       cuda/TestCuda_Spaces.cpp
   )
 
@@ -524,17 +569,8 @@ if(Kokkos_ENABLE_HIP)
       ${HIP_SOURCES}
       hip/TestHIP_ScanUnit.cpp
       hip/TestHIP_TeamScratchStreams.cpp
-      hip/TestHIPHostPinned_ViewAPI_a.cpp
-      hip/TestHIPHostPinned_ViewAPI_b.cpp
-      hip/TestHIPHostPinned_ViewAPI_c.cpp
-      hip/TestHIPHostPinned_ViewAPI_d.cpp
-      hip/TestHIPHostPinned_ViewAPI_e.cpp
-      hip/TestHIPHostPinned_ViewCopy_a.cpp
-      hip/TestHIPHostPinned_ViewCopy_b.cpp
-      hip/TestHIPHostPinned_ViewMapping_a.cpp
-      hip/TestHIPHostPinned_ViewMapping_b.cpp
-      hip/TestHIPHostPinned_ViewMapping_subview.cpp
       hip/TestHIP_AsyncLauncher.cpp
+      hip/TestHIP_BlocksizeDeduction.cpp
   )
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_HIPInterOpInit
@@ -595,13 +631,25 @@ if(Kokkos_ENABLE_SYCL)
       ${SYCL_SOURCES2C}
   )
 
- KOKKOS_ADD_EXECUTABLE_AND_TEST(
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_SYCL2D
     SOURCES
       UnitTestMainInit.cpp
       ${SYCL_SOURCES2D}
   )
- KOKKOS_ADD_EXECUTABLE_AND_TEST(
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_SYCL3
+    SOURCES
+      UnitTestMainInit.cpp
+      # FIXME_SYCL
+      sycl/TestSYCL_Task.cpp
+      sycl/TestSYCL_TeamScratchStreams.cpp
+      ${SYCL_SOURCES3}
+      sycl/TestSYCL_Spaces.cpp
+  )
+
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_SYCLInterOpInit
     SOURCES
       UnitTestMain.cpp
@@ -622,8 +670,7 @@ if(Kokkos_ENABLE_SYCL)
 endif()
 
 # FIXME_OPENMPTARGET - Comment non-passing tests with the NVIDIA HPC compiler nvc++
-if (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+if (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   SET(DEFAULT_DEVICE_SOURCES
     UnitTestMainInit.cpp
     default/TestDefaultDeviceType.cpp
@@ -685,11 +732,21 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
 )
 
   if(KOKKOS_ENABLE_TUNING)
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(
+      UnitTest_TuningBuiltins
+      SOURCES
+      tools/TestBuiltinTuners.cpp
+    )
     KOKKOS_ADD_EXECUTABLE_AND_TEST(
       UnitTest_TuningBasics
       SOURCES
         tools/TestTuning.cpp
     )
+    KOKKOS_ADD_EXECUTABLE_AND_TEST(
+      UnitTest_CategoricalTuner
+      SOURCES
+      tools/TestCategoricalTuner.cpp
+    )
   endif()
   if(NOT Kokkos_ENABLE_OPENMPTARGET)
   KOKKOS_ADD_EXECUTABLE_AND_TEST(
@@ -698,6 +755,11 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       tools/TestLogicalSpaces.cpp
   )
   endif()
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_EventCorrectness
+    SOURCES
+    tools/TestEventCorrectness.cpp
+  )
   if(KOKKOS_ENABLE_LIBDL)
 
     KOKKOS_ADD_TEST_LIBRARY(
@@ -745,7 +807,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       TOOL kokkosprinter-tool
       ARGS --kokkos-tools-args="-c test delimit"
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
 
     # Above will test that leading/trailing quotes are stripped bc ctest cmd args is:
@@ -762,7 +824,7 @@ KOKKOS_ADD_ADVANCED_TEST( UnitTest_PushFinalizeHook_terminate
       EXE  ProfilingAllCalls
       ARGS [=[--kokkos-tools-args=-c test delimit]=]
             --kokkos-tools-library=$<TARGET_FILE:kokkosprinter-tool>
-      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source]:0:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination]:0:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
+      PASS_REGULAR_EXPRESSION "kokkosp_init_library::kokkosp_parse_args:4:KokkosCore_ProfilingAllCalls:-c:test:delimit::.*::kokkosp_allocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]source] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_allocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_begin_parallel_for:Kokkos::View::initialization [[]destination] via memset:[0-9]+:0::kokkosp_end_parallel_for:0::kokkosp_begin_deep_copy:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_end_deep_copy::kokkosp_begin_parallel_for:parallel_for:${SIZE_REGEX}:0::kokkosp_end_parallel_for:0::kokkosp_begin_parallel_reduce:parallel_reduce:${SIZE_REGEX}:1${SKIP_SCRATCH_INITIALIZATION_REGEX}::kokkosp_end_parallel_reduce:1::kokkosp_begin_parallel_scan:parallel_scan:${SIZE_REGEX}:2::kokkosp_end_parallel_scan:2::kokkosp_push_profile_region:push_region::kokkosp_pop_profile_region::kokkosp_create_profile_section:created_section:3::kokkosp_start_profile_section:3::kokkosp_stop_profile_section:3::kokkosp_destroy_profile_section:3::kokkosp_profile_event:profiling_event::kokkosp_declare_metadata:dogs:good::kokkosp_deallocate_data:${MEMSPACE_REGEX}:destination:${ADDRESS_REGEX}:40::kokkosp_deallocate_data:${MEMSPACE_REGEX}:source:${ADDRESS_REGEX}:40::kokkosp_finalize_library::"
     )
   endif() #KOKKOS_ENABLE_LIBDL
 if(NOT KOKKOS_HAS_TRILINOS)
diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile
index 390fc79a4755e46cbd61b28ee54d44814fa501d9..422628221402586ec4829ad5d8b628cbdd3736b1 100644
--- a/packages/kokkos/core/unit_test/Makefile
+++ b/packages/kokkos/core/unit_test/Makefile
@@ -73,6 +73,8 @@ tmp := $(foreach device, $(KOKKOS_DEVICELIST), \
   ) \
 )
 
+GPU_SPACE_TESTS = SharedAlloc ViewAPI_a ViewAPI_b ViewAPI_c ViewAPI_d ViewAPI_e ViewCopy_a ViewCopy_b ViewMapping_a ViewMapping_b ViewMapping_subview
+
 SUBVIEW_TESTS = SubView_a SubView_b SubView_c01 SubView_c02 SubView_c03 SubView_c04 SubView_c05 SubView_c06 SubView_c07 SubView_c08 SubView_c09 SubView_c10 SubView_c11 SubView_c12 SubView_c13
 
 KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST))
@@ -94,6 +96,16 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
       )\
     )
 
+    GPU_SPACES = CudaHostPinned CudaUVM
+    tmp := $(foreach space, $(GPU_SPACES), \
+      tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \
+        $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\
+          $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \
+          $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \
+        )\
+      )\
+    )
+
     OBJ_CUDA = UnitTestMainInit.o gtest-all.o
     OBJ_CUDA += TestCuda_Init.o
     OBJ_CUDA += TestCuda_SharedAlloc.o TestCudaUVM_SharedAlloc.o TestCudaHostPinned_SharedAlloc.o
@@ -261,6 +273,16 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+	GPU_SPACES = HIPHostPinned
+	tmp := $(foreach space, $(GPU_SPACES), \
+	  tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \
+	    $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\
+	      $(shell echo "\#include <Test$(space)_Category.hpp>" > Test$(space)_$(test).cpp); \
+	      $(shell echo "\#include <Test"$(test)".hpp>" >> Test$(space)_$(test).cpp); \
+	    )\
+	  )\
+	)
+
 	OBJ_HIP = UnitTestMainInit.o gtest-all.o
 	OBJ_HIP += TestHIP_Init.o
 	OBJ_HIP += TestHIP_Reducers_a.o TestHIP_Reducers_b.o TestHIP_Reducers_c.o TestHIP_Reducers_d.o
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
index 04362125c0648e679f9a1cfb9886ccb84e6b14d5..257ad2e9e5bba73babacd0153ba74f0ab1a2ba15 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -81,6 +81,56 @@ struct InitFunctor {
   InitFunctor(T _init_value) : init_value(_init_value) {}
 };
 
+//---------------------------------------------------
+//--------------atomic_load/store/assign---------------------
+//---------------------------------------------------
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+template <class T, class DEVICE_TYPE>
+struct LoadStoreFunctor {
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
+
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    T old = Kokkos::atomic_load(&data());
+    if (old != i0)
+      Kokkos::abort("Kokkos Atomic Load didn't get the right value");
+    Kokkos::atomic_store(&data(), i1);
+    Kokkos::atomic_assign(&data(), old);
+  }
+  LoadStoreFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
+};
+#endif
+
+template <class T, class DeviceType>
+bool LoadStoreAtomicTest(T i0, T i1) {
+  using execution_space = typename DeviceType::execution_space;
+  struct InitFunctor<T, execution_space> f_init(i0);
+  typename InitFunctor<T, execution_space>::type data("Data");
+  typename InitFunctor<T, execution_space>::h_type h_data("HData");
+
+  f_init.data = data;
+  Kokkos::parallel_for(1, f_init);
+  execution_space().fence();
+
+#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
+  struct LoadStoreFunctor<T, execution_space> f(i0, i1);
+
+  f.data = data;
+  Kokkos::parallel_for(1, f);
+#else
+  h_data() = i1;
+#endif
+
+  Kokkos::deep_copy(h_data, data);
+
+  return h_data() == i0;
+}
+
 //---------------------------------------------------
 //--------------atomic_fetch_max---------------------
 //---------------------------------------------------
@@ -594,7 +644,10 @@ struct AndFunctor {
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_and(&data(), (T)i1); }
+  void operator()(int) const {
+    T result = Kokkos::atomic_fetch_and(&data(), (T)i1);
+    Kokkos::atomic_and(&data(), result);
+  }
 
   AndFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
 };
@@ -665,7 +718,10 @@ struct OrFunctor {
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const { Kokkos::atomic_fetch_or(&data(), (T)i1); }
+  void operator()(int) const {
+    T result = Kokkos::atomic_fetch_or(&data(), (T)i1);
+    Kokkos::atomic_or(&data(), result);
+  }
 
   OrFunctor(T _i0, T _i1) : i0(_i0), i1(_i1) {}
 };
@@ -954,6 +1010,7 @@ bool AtomicOperationsTestIntegralType(int i0, int i1, int test) {
     case 10: return RShiftAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 11: return IncAtomicTest<T, DeviceType>((T)i0);
     case 12: return DecAtomicTest<T, DeviceType>((T)i0);
+    case 13: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
   }
 
   return 0;
@@ -966,6 +1023,7 @@ bool AtomicOperationsTestNonIntegralType(int i0, int i1, int test) {
     case 2: return MinAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 3: return MulAtomicTest<T, DeviceType>((T)i0, (T)i1);
     case 4: return DivAtomicTest<T, DeviceType>((T)i0, (T)i1);
+    case 5: return LoadStoreAtomicTest<T, DeviceType>((T)i0, (T)i1);
   }
 
   return 0;
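
The new LoadStoreAtomicTest is wired in as case 13 for integral types and case 5 for non-integral types: the device thread atomically reads the initialized value, stores i1, then assigns the original value back, so the host-side check expects to read i0 again after the deep copy. A host-only sketch of the sequence being verified, assuming a build where these atomics are usable from host code (the test itself guards them behind the desul-atomics macro):

#include <Kokkos_Core.hpp>
#include <cassert>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    int value = 17;                           // stands in for data() initialized to i0 == 17
    int old   = Kokkos::atomic_load(&value);  // must observe the initial value
    assert(old == 17);
    Kokkos::atomic_store(&value, 42);         // i1
    Kokkos::atomic_assign(&value, old);       // write i0 back, as the functor does
    assert(value == 17);                      // the condition LoadStoreAtomicTest returns
  }
  Kokkos::finalize();
  return 0;
}
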
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
index ba9937e1c6643bfd8a4decde2c7823061b0fcbe4..303f5b6eb9f77c3767056ac2639122d4b01b9d7a 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_double.hpp
@@ -57,6 +57,8 @@ TEST(TEST_CATEGORY, atomic_operations_double) {
                  double, TEST_EXECSPACE>(start, end - i, 3)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
                  double, TEST_EXECSPACE>(start, end - i, 4)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                 double, TEST_EXECSPACE>(start, end - i, 5)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
index aa56b5ff10e770d2964d498f99e25611d85311c6..d3d4916b4ea6d623b010834627f41ebf65161ff7 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_float.hpp
@@ -57,6 +57,8 @@ TEST(TEST_CATEGORY, atomic_operations_float) {
                  float, TEST_EXECSPACE>(start, end - i, 3)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
                  float, TEST_EXECSPACE>(start, end - i, 4)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestNonIntegralType<
+                 float, TEST_EXECSPACE>(start, end - i, 5)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
index f828be6223c7b4e7554252fd04927ce1a3fcb69a..e5f2f334fc2b07e24ed5f77d75a64947c2117e21 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_int.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_int) {
                  int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
index eee44c9571cf890b25a9a1b9bb32edd279d3cae7..d4fda70e80cff156aa58bc0294c66be38729f745 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_long) {
                  long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
index 73d4a61d7291f852d1ff2d6607d36a1d0bb2f829..b7fb0cdae5f99f6704dbdd18c2554bcb6b3e5ddf 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_longlongint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_longlong) {
                  long long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  long long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 long long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
index 02f337c57c64633d62d8111c28ae49cee05e80e3..c3c6bc9fb38d9dc9af37bc69c29e60d1fd040cc6 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_unsigned) {
                  unsigned int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  unsigned int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 unsigned int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
index f4340475f573c3c8f4c108f8b7bfacef0d72af4e..f3be4bedb794884998639eb9a313db5079bebdd2 100644
--- a/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicOperations_unsignedlongint.hpp
@@ -71,6 +71,8 @@ TEST(TEST_CATEGORY, atomic_operations_unsignedlong) {
                  unsigned long int, TEST_EXECSPACE>(start, end - i, 11)));
     ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
                  unsigned long int, TEST_EXECSPACE>(start, end - i, 12)));
+    ASSERT_TRUE((TestAtomicOperations::AtomicOperationsTestIntegralType<
+                 unsigned long int, TEST_EXECSPACE>(start, end - i, 13)));
   }
 }
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestAtomicViews.hpp b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
index b615b407f334a60d187bbc3c27b3c69110acc94c..e029ad81f576f25333470e6078eee9445abba3ec 100644
--- a/packages/kokkos/core/unit_test/TestAtomicViews.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomicViews.hpp
@@ -245,11 +245,11 @@ class TestAtomicViewAPI {
     ASSERT_EQ(ax.use_count(), size_t(4));
     ASSERT_EQ(const_ax.use_count(), ax.use_count());
 
-    ASSERT_FALSE(ax.data() == nullptr);
-    ASSERT_FALSE(const_ax.data() == nullptr);  // referenceable ptr
-    ASSERT_FALSE(unmanaged_ax.data() == nullptr);
-    ASSERT_FALSE(unmanaged_ax_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(ay.data() == nullptr);
+    ASSERT_NE(ax.data(), nullptr);
+    ASSERT_NE(const_ax.data(), nullptr);  // referenceable ptr
+    ASSERT_NE(unmanaged_ax.data(), nullptr);
+    ASSERT_NE(unmanaged_ax_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(ay.data(), nullptr);
     //    ASSERT_NE( ax, ay );
     //    Above test results in following runtime error from gtest:
     //    Expected: (ax) != (ay), actual: 32-byte object <30-01 D0-A0 D8-7F
@@ -278,7 +278,7 @@ class TestAtomicViewAPI {
                          Kokkos::MemoryTraits<Kokkos::Atomic> >& arg_const,
       const Kokkos::View<const DataType, device,
                          Kokkos::MemoryTraits<Kokkos::Atomic> >& arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_const() {
@@ -290,8 +290,8 @@ class TestAtomicViewAPI {
     typeX x("X");
     const_typeX xc = x;
 
-    // ASSERT_TRUE( xc == x ); // const xc is referenceable, non-const x is not
-    // ASSERT_TRUE( x == xc );
+    // ASSERT_EQ( xc ,  x ); // const xc is referenceable, non-const x is not
+    // ASSERT_EQ( x ,  xc );
 
     check_auto_conversion_to_const(x, xc);
   }
diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp
index e41ad5257d64ad3acb3266a0354f18d291662377..f2993914a11560f30cf70aabacf444f3b38e9bfc 100644
--- a/packages/kokkos/core/unit_test/TestAtomics.hpp
+++ b/packages/kokkos/core/unit_test/TestAtomics.hpp
@@ -97,7 +97,7 @@ struct SuperScalar {
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar operator+(const SuperScalar& src) {
+  SuperScalar operator+(const SuperScalar& src) const {
     SuperScalar tmp = *this;
     for (int i = 0; i < N; i++) {
       tmp.val[i] += src.val[i];
@@ -540,8 +540,6 @@ TEST(TEST_CATEGORY, atomics) {
 
 // FIXME_SYCL atomics for large types to be implemented
 #ifndef KOKKOS_ENABLE_SYCL
-  // FIXME_HIP HIP doesn't yet support atomics for >64bit types properly
-#ifndef KOKKOS_ENABLE_HIP
   ASSERT_TRUE(
       (TestAtomic::Loop<Kokkos::complex<double>, TEST_EXECSPACE>(1, 1)));
   ASSERT_TRUE(
@@ -567,7 +565,6 @@ TEST(TEST_CATEGORY, atomics) {
 #endif
 #endif
 #endif
-#endif
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp
index b926058ebf990b0c7d0bff6f4c22b5bd4c12e2e8..be0c1e50d7013efc177f427b481a4b67b1441744 100644
--- a/packages/kokkos/core/unit_test/TestComplex.hpp
+++ b/packages/kokkos/core/unit_test/TestComplex.hpp
@@ -515,4 +515,44 @@ TEST(TEST_CATEGORY, complex_issue_3867) {
 #undef CHECK_POW_COMPLEX_PROMOTION
 }
 
+TEST(TEST_CATEGORY, complex_operations_arithmetic_types_overloads) {
+#define STATIC_ASSERT(cond) static_assert(cond, "")
+
+  STATIC_ASSERT(Kokkos::real(1) == 1.);
+  STATIC_ASSERT(Kokkos::real(2.f) == 2.f);
+  STATIC_ASSERT(Kokkos::real(3.) == 3.);
+  STATIC_ASSERT(Kokkos::real(4.l) == 4.l);
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(1)), double>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(2.f)), float>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::real(3.)), double>::value));
+  STATIC_ASSERT(
+      (std::is_same<decltype(Kokkos::real(4.l)), long double>::value));
+
+  STATIC_ASSERT(Kokkos::imag(1) == 0.);
+  STATIC_ASSERT(Kokkos::imag(2.f) == 0.f);
+  STATIC_ASSERT(Kokkos::imag(3.) == 0.);
+  STATIC_ASSERT(Kokkos::imag(4.l) == 0.l);
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(1)), double>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(2.f)), float>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::imag(3.)), double>::value));
+  STATIC_ASSERT(
+      (std::is_same<decltype(Kokkos::imag(4.l)), long double>::value));
+
+  // FIXME in principle could be checked at compile time too
+  ASSERT_EQ(Kokkos::conj(1), Kokkos::complex<double>(1));
+  ASSERT_EQ(Kokkos::conj(2.f), Kokkos::complex<float>(2.f));
+  ASSERT_EQ(Kokkos::conj(3.), Kokkos::complex<double>(3.));
+  ASSERT_EQ(Kokkos::conj(4.l), Kokkos::complex<long double>(4.l));
+  STATIC_ASSERT((
+      std::is_same<decltype(Kokkos::conj(1)), Kokkos::complex<double>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(2.f)),
+                              Kokkos::complex<float>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(3.)),
+                              Kokkos::complex<double>>::value));
+  STATIC_ASSERT((std::is_same<decltype(Kokkos::conj(4.l)),
+                              Kokkos::complex<long double>>::value));
+
+#undef STATIC_ASSERT
+}
+
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
index 49f8daf89eabca9b3aa7e1f06d7a10ceb23a6a24..f487a015fbf261f85bf2b8a0b4755dadcefe2f32 100644
--- a/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
+++ b/packages/kokkos/core/unit_test/TestDeepCopyAlignment.hpp
@@ -296,7 +296,7 @@ struct TestDeepCopyScalarConversion {
 
     int64_t errors = 0;
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
 
     Kokkos::deep_copy(view_s1_1d, static_cast<Scalar1>(0));
     Kokkos::deep_copy(view_s1_2d, static_cast<Scalar1>(0));
@@ -306,7 +306,7 @@ struct TestDeepCopyScalarConversion {
                                              Kokkos::IndexType<int64_t>>(0, N0),
                          *this);
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors > 0);
+    ASSERT_GT(errors, 0);
 
     Kokkos::deep_copy(error_count, 0);
     Kokkos::deep_copy(TEST_EXECSPACE(), view_s1_1d, view_s2_1d);
@@ -318,7 +318,7 @@ struct TestDeepCopyScalarConversion {
                          *this);
 
     Kokkos::deep_copy(errors, error_count);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 };
 }  // namespace Impl
diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 8a9263c8df5fcd1eab370837d9d2de92281e7aaa..90e485998ec08ba98716185298860fb4c407daf2 100644
--- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -79,7 +79,7 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
   int numa_idx    = (do_other ? 3 : 0) + (do_threads ? 1 : 0);
   int device_idx =
       (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0);
-  int tune_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) +
+  int tune_idx = (do_other ? 4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) +
                  (do_device ? 1 : 0);
 
   if (do_threads) {
diff --git a/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f87fda615643c5a75ef5ee4da7349bab0eea40cd
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestDetectionIdiom.cpp
@@ -0,0 +1,96 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_DetectionIdiom.hpp>
+
+#define STATIC_ASSERT(cond) static_assert(cond, "");
+
+void test_nonesuch() {
+  using Kokkos::nonesuch;
+  STATIC_ASSERT(!std::is_constructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_destructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_copy_constructible<nonesuch>::value);
+  STATIC_ASSERT(!std::is_move_constructible<nonesuch>::value);
+#ifdef KOKKOS_ENABLE_CXX17
+  STATIC_ASSERT(!std::is_aggregate<nonesuch>::value);
+#endif
+}
+
+#undef STATIC_ASSERT
+
+namespace Example {
+// Example from https://en.cppreference.com/w/cpp/experimental/is_detected
+template <class T>
+using copy_assign_t = decltype(std::declval<T&>() = std::declval<const T&>());
+
+struct Meow {};
+struct Purr {
+  void operator=(const Purr&) = delete;
+};
+
+static_assert(Kokkos::is_detected<copy_assign_t, Meow>::value,
+              "Meow should be copy assignable!");
+static_assert(!Kokkos::is_detected<copy_assign_t, Purr>::value,
+              "Purr should not be copy assignable!");
+static_assert(Kokkos::is_detected_exact<Meow&, copy_assign_t, Meow>::value,
+              "Copy assignment of Meow should return Meow&!");
+
+template <class T>
+using diff_t = typename T::difference_type;
+
+template <class Ptr>
+using difference_type = Kokkos::detected_or_t<std::ptrdiff_t, diff_t, Ptr>;
+
+struct Woof {
+  using difference_type = int;
+};
+struct Bark {};
+
+static_assert(std::is_same<difference_type<Woof>, int>::value,
+              "Woof's difference_type should be int!");
+static_assert(std::is_same<difference_type<Bark>, std::ptrdiff_t>::value,
+              "Bark's difference_type should be ptrdiff_t!");
+}  // namespace Example
+
+int main() {}
diff --git a/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f8f5275d3dd66ce42256bf61637112d026687a3a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestExecSpacePartitioning.hpp
@@ -0,0 +1,129 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+namespace {
+struct SumFunctor {
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i, int& lsum) const { lsum += i; }
+};
+
+template <class ExecSpace>
+void check_distinctive(ExecSpace, ExecSpace) {}
+
+#ifdef KOKKOS_ENABLE_CUDA
+void check_distinctive(Kokkos::Cuda exec1, Kokkos::Cuda exec2) {
+  ASSERT_NE(exec1.cuda_stream(), exec2.cuda_stream());
+}
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+void check_distinctive(Kokkos::Experimental::HIP exec1,
+                       Kokkos::Experimental::HIP exec2) {
+  ASSERT_NE(exec1.hip_stream(), exec2.hip_stream());
+}
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+void check_distinctive(Kokkos::Experimental::SYCL exec1,
+                       Kokkos::Experimental::SYCL exec2) {
+  ASSERT_NE(*exec1.impl_internal_space_instance()->m_queue,
+            *exec2.impl_internal_space_instance()->m_queue);
+}
+#endif
+}  // namespace
+
+void test_partitioning(std::vector<TEST_EXECSPACE>& instances) {
+  check_distinctive(instances[0], instances[1]);
+  int sum1, sum2;
+  int N = 3910;
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<TEST_EXECSPACE>(instances[0], 0, N), SumFunctor(),
+      sum1);
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<TEST_EXECSPACE>(instances[1], 0, N), SumFunctor(),
+      sum2);
+  ASSERT_EQ(sum1, sum2);
+  ASSERT_EQ(sum1, N * (N - 1) / 2);
+
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \
+    defined(KOKKOS_ENABLE_SYCL)
+  // Eliminate unused function warning
+  // (i.e. when compiling for Serial and CUDA, during Serial compilation the
+  // Cuda overload is unused ...)
+  if (sum1 != sum2) {
+#ifdef KOKKOS_ENABLE_CUDA
+    check_distinctive(Kokkos::Cuda(), Kokkos::Cuda());
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+    check_distinctive(Kokkos::Experimental::HIP(), Kokkos::Experimental::HIP());
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+    check_distinctive(Kokkos::Experimental::SYCL(),
+                      Kokkos::Experimental::SYCL());
+#endif
+  }
+#endif
+}
+
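+// Both partition_space overloads are covered below: one taking the weights as
+// a variadic argument list and one taking them as a std::vector. Two weights
+// are expected to yield two execution space instances.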
+TEST(TEST_CATEGORY, partitioning_by_args) {
+  auto instances =
+      Kokkos::Experimental::partition_space(TEST_EXECSPACE(), 1, 1.);
+  ASSERT_EQ(int(instances.size()), 2);
+  test_partitioning(instances);
+}
+
+TEST(TEST_CATEGORY, partitioning_by_vector) {
+  std::vector<int> weights{1, 1};
+  auto instances =
+      Kokkos::Experimental::partition_space(TEST_EXECSPACE(), weights);
+  ASSERT_EQ(int(instances.size()), 2);
+  test_partitioning(instances);
+}
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp
similarity index 68%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
rename to packages/kokkos/core/unit_test/TestExecutionSpace.hpp
index 316a2b5d0fe0dba2c9b74f3f6f7a6d61342d2c4c..8e4331e809a806f8b7931735e445706abc2e06c5 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_a.cpp
+++ b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp
@@ -42,5 +42,39 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_a.hpp>
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace {
+
+struct StructCopy {
+  Kokkos::DefaultExecutionSpace device;
+  Kokkos::DefaultHostExecutionSpace host;
+};
+
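+// check_struct_copy captures a struct holding both the default device and
+// host execution space handles and copies it by value inside a kernel,
+// exercising that execution space objects are copyable into device code.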
+template <class ExecutionSpace>
+void check_struct_copy() {
+#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
+  // FIXME_OPENMPTARGET nvlink error: Undefined reference to
+  // '_ZSt25__throw_bad_function_callv' in
+  // '/tmp/TestOpenMPTarget_ExecutionSpace-434d81.cubin'
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+  StructCopy data;
+  parallel_for(
+      Kokkos::RangePolicy<ExecutionSpace>(0, 1), KOKKOS_LAMBDA(int) {
+        StructCopy data2 = data;
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF("%i \n", data2.device.in_parallel());
+      });
+#endif
+#endif
+}
+
+}  // namespace
+
+TEST(TEST_CATEGORY, copy_structure) { check_struct_copy<TEST_EXECSPACE>(); }
+}  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestHalfConversion.hpp b/packages/kokkos/core/unit_test/TestHalfConversion.hpp
index 277fb1b04234e58b0fe3d1639e48fcd1dc51ff86..992f56cc6b833882676f71817bae3b6bd03631d6 100644
--- a/packages/kokkos/core/unit_test/TestHalfConversion.hpp
+++ b/packages/kokkos/core/unit_test/TestHalfConversion.hpp
@@ -53,7 +53,7 @@ void test_half_conversion_type() {
   T base                         = static_cast<T>(3.3);
   Kokkos::Experimental::half_t a = Kokkos::Experimental::cast_to_half(base);
   T b                            = Kokkos::Experimental::cast_from_half<T>(a);
-  ASSERT_TRUE((double(b - base) / double(base)) < epsilon);
+  ASSERT_LT((double(b - base) / double(base)), epsilon);
 
 // TODO: Remove ifndef once https://github.com/kokkos/kokkos/pull/3480 merges
 #ifndef KOKKOS_ENABLE_SYCL
@@ -67,7 +67,7 @@ void test_half_conversion_type() {
       });
 
   Kokkos::deep_copy(b, b_v);
-  ASSERT_TRUE((double(b - base) / double(base)) < epsilon);
+  ASSERT_LT((double(b - base) / double(base)), epsilon);
 #endif  // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
 #endif  // KOKKOS_ENABLE_SYCL
 }
diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
index db52a05d5d36d5919e101f60dd7652c92771c885..c4cf8a745701897a2a36c38540a94149650b5e2d 100644
--- a/packages/kokkos/core/unit_test/TestHalfOperators.hpp
+++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp
@@ -269,6 +269,85 @@ enum OP_TESTS {
   N_OP_TESTS
 };
 
+template <class view_type>
+struct Functor_TestHalfVolatileOperators {
+  volatile half_t h_lhs, h_rhs;
+  view_type actual_lhs, expected_lhs;
+  double d_lhs, d_rhs;
+  Functor_TestHalfVolatileOperators(volatile half_t lhs = half_t(0),
+                                    volatile half_t rhs = half_t(0))
+      : h_lhs(lhs), h_rhs(rhs) {
+    actual_lhs   = view_type("actual_lhs", N_OP_TESTS);
+    expected_lhs = view_type("expected_lhs", N_OP_TESTS);
+    d_lhs        = cast_from_half<double>(h_lhs);
+    d_rhs        = cast_from_half<double>(h_rhs);
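+    // Host views are filled by invoking the functor directly on this thread;
+    // device views are filled through a one-iteration parallel_for.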
+    if (std::is_same<view_type, ViewTypeHost>::value) {
+      auto run_on_host = *this;
+      run_on_host(0);
+    } else {
+      Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators",
+                           Kokkos::RangePolicy<ExecutionSpace>(0, 1), *this);
+    }
+  }
+
+  KOKKOS_FUNCTION
+  void operator()(int) const {
+    volatile half_t tmp_lhs;
+
+    // Initialize output views to catch missing test invocations
+    for (int i = 0; i < N_OP_TESTS; ++i) {
+      actual_lhs(i)   = 1;
+      expected_lhs(i) = -1;
+    }
+
+    tmp_lhs              = h_lhs;
+    actual_lhs(ASSIGN)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(ASSIGN) = d_lhs;
+
+    actual_lhs(LT)   = h_lhs < h_rhs;
+    expected_lhs(LT) = d_lhs < d_rhs;
+
+    actual_lhs(LE)   = h_lhs <= h_rhs;
+    expected_lhs(LE) = d_lhs <= d_rhs;
+
+    actual_lhs(NEQ)   = h_lhs != h_rhs;
+    expected_lhs(NEQ) = d_lhs != d_rhs;
+
+    actual_lhs(GT)   = h_lhs > h_rhs;
+    expected_lhs(GT) = d_lhs > d_rhs;
+
+    actual_lhs(GE)   = h_lhs >= h_rhs;
+    expected_lhs(GE) = d_lhs >= d_rhs;
+
+    actual_lhs(EQ)   = h_lhs == h_rhs;
+    expected_lhs(EQ) = d_lhs == d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs += h_rhs;
+    actual_lhs(CADD_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CADD_H_H) = d_lhs;
+    expected_lhs(CADD_H_H) += d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs -= h_rhs;
+    actual_lhs(CSUB_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CSUB_H_H) = d_lhs;
+    expected_lhs(CSUB_H_H) -= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs *= h_rhs;
+    actual_lhs(CMUL_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CMUL_H_H) = d_lhs;
+    expected_lhs(CMUL_H_H) *= d_rhs;
+
+    tmp_lhs = h_lhs;
+    tmp_lhs /= h_rhs;
+    actual_lhs(CDIV_H_H)   = cast_from_half<double>(tmp_lhs);
+    expected_lhs(CDIV_H_H) = d_lhs;
+    expected_lhs(CDIV_H_H) /= d_rhs;
+  }
+};
+
 template <class view_type>
 struct Functor_TestHalfOperators {
   half_t h_lhs, h_rhs;
@@ -840,8 +919,33 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
                 epsilon);
   }
 
-  // Check whether half_t is trivially copyable
-  ASSERT_TRUE(std::is_trivially_copyable<half_t>::value);
+  // Test partial volatile support
+  volatile half_t _h_lhs = h_lhs;
+  volatile half_t _h_rhs = h_rhs;
+  Functor_TestHalfVolatileOperators<ViewType> f_volatile_device(_h_lhs, _h_rhs);
+  Functor_TestHalfVolatileOperators<ViewTypeHost> f_volatile_host(_h_lhs,
+                                                                  _h_rhs);
+
+  ExecutionSpace().fence();
+  Kokkos::deep_copy(f_device_actual_lhs, f_volatile_device.actual_lhs);
+  Kokkos::deep_copy(f_device_expected_lhs, f_volatile_device.expected_lhs);
+  for (int op_test = 0; op_test < N_OP_TESTS; op_test++) {
+    // printf("op_test = %d\n", op_test);
+    if (op_test == ASSIGN || op_test == LT || op_test == LE || op_test == NEQ ||
+        op_test == EQ || op_test == GT || op_test == GE ||
+        op_test == CADD_H_H || op_test == CSUB_H_H || op_test == CMUL_H_H ||
+        op_test == CDIV_H_H) {
+      ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test),
+                  epsilon);
+      ASSERT_NEAR(f_volatile_host.actual_lhs(op_test),
+                  f_volatile_host.expected_lhs(op_test), epsilon);
+    }
+  }
+
+  // is_trivially_copyable is false with the addition of explicit
+  // copy constructors that are required for supporting reductions
+  // ASSERT_TRUE(std::is_trivially_copyable<half_t>::value);
+
   constexpr size_t n       = 2;
   constexpr size_t n_bytes = sizeof(half_t) * n;
   const half_t h_arr0 = half_t(0x89ab), h_arr1 = half_t(0xcdef);
@@ -854,11 +958,11 @@ void __test_half_operators(half_t h_lhs, half_t h_rhs) {
   h_arr_ptr = reinterpret_cast<char*>(h_arr);
 
   std::memcpy(c_arr, h_arr, n_bytes);
-  for (i = 0; i < n_bytes; i++) ASSERT_TRUE(c_arr[i] == h_arr_ptr[i]);
+  for (i = 0; i < n_bytes; i++) ASSERT_EQ(c_arr[i], h_arr_ptr[i]);
 
   std::memcpy(h_arr, c_arr, n_bytes);
-  ASSERT_TRUE(h_arr[0] == h_arr0);
-  ASSERT_TRUE(h_arr[1] == h_arr1);
+  ASSERT_EQ(h_arr[0], h_arr0);
+  ASSERT_EQ(h_arr[1], h_arr1);
 }
 
 void test_half_operators() {
@@ -870,7 +974,6 @@ void test_half_operators() {
     // TODO: __test_half_operators(h_lhs + cast_to_half(i + 1), half_t(0));
     // TODO: __test_half_operators(half_t(0), h_rhs + cast_to_half(i));
   }
-  // TODO: __test_half_operators(0, 0);
 }
 
 TEST(TEST_CATEGORY, half_operators) { test_half_operators(); }
diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
index 18d1ac85188ca17cd7d127d3187103f42402be18..10180251ba582e9d11672ce74b4d22335c9da3d4 100644
--- a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
+++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp
@@ -52,14 +52,17 @@ using Kokkos::Impl::HostSharedPtr;
 namespace {
 
 class Data {
-  Kokkos::Array<char, 64> d;
+  char d[64];
 
  public:
-  KOKKOS_FUNCTION void write(char const* c) {
-    for (int i = 0; i < 64 && c; ++i, ++c) {
-      d[i] = *c;
-    }
+  // Because strncpy is not supported within device code
+  static KOKKOS_FUNCTION void my_strncpy(char* dst, const char* src,
+                                         size_t cnt) {
+    // Checking cnt before decrementing avoids size_t wrap-around (and an
+    // unbounded zero-fill) when src does not fit into dst.
+    while (cnt > 0 && (*dst++ = *src++) != '\0') --cnt;
+    if (cnt > 0) --cnt;  // one slot was consumed by the copied '\0'
+    while (cnt > 0) {
+      *dst++ = '\0';
+      --cnt;
+    }
   }
+  KOKKOS_FUNCTION void write(char const* s) { my_strncpy(d, s, sizeof(d)); }
 };
 
 template <class SmartPtr>
@@ -154,3 +157,135 @@ TEST(TEST_CATEGORY, host_shared_ptr_special_members_on_device) {
   check_special_members_on_device(device_ptr);
 }
 #endif
+
+// FIXME_OPENMPTARGET
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \
+    !defined(KOKKOS_ENABLE_OPENMPTARGET)
+namespace {
+
+struct Bar {
+  double val;
+};
+
+struct Foo {
+  Foo(bool allocate = false) : ptr(allocate ? new Bar : nullptr) {}
+  Kokkos::Impl::HostSharedPtr<Bar> ptr;
+  int use_count() { return ptr.use_count(); }
+};
+
+template <class DevMemSpace, class HostMemSpace>
+void host_shared_ptr_test_reference_counting() {
+  using ExecSpace = typename DevMemSpace::execution_space;
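+  // The reference count is only maintained where the memory is
+  // host-accessible, so the expected use_count values below branch on is_gpu.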
+  bool is_gpu =
+      !Kokkos::SpaceAccessibility<ExecSpace, Kokkos::HostSpace>::accessible;
+
+  // Create two tracked instances
+  Foo f1(true), f2(true);
+  // Scope Views
+  {
+    Foo* fp_d_ptr =
+        static_cast<Foo*>(Kokkos::kokkos_malloc<DevMemSpace>(sizeof(Foo)));
+    Kokkos::View<Foo, DevMemSpace> fp_d(fp_d_ptr);
+    // If using UVM or on the CPU don't make an extra HostCopy
+    Foo* fp_h_ptr = std::is_same<DevMemSpace, HostMemSpace>::value
+                        ? fp_d_ptr
+                        : static_cast<Foo*>(
+                              Kokkos::kokkos_malloc<HostMemSpace>(sizeof(Foo)));
+    Kokkos::View<Foo, HostMemSpace> fp_h(fp_h_ptr);
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // Just for the sake of it, initialize the data of the host copy
+    new (fp_h.data()) Foo();
+    // placement new in kernel:
+    //  if on GPU: should not increase use_count, fp_d will not be tracked
+    //  if on Host: refcount will increase and fp_d is tracked
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<ExecSpace>(0, 1),
+        KOKKOS_LAMBDA(int) { new (fp_d.data()) Foo(f1); });
+    Kokkos::fence();
+    Kokkos::deep_copy(fp_h, fp_d);
+
+    if (is_gpu)
+      ASSERT_EQ(1, f1.use_count());
+    else
+      ASSERT_EQ(2, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // assignment operator on host will increase f2's use_count
+    //   if default device is GPU: fp_h was untracked
+    //   if default device is CPU: fp_h was tracked and its use_count was 2
+    //   because it aliased f1; that use_count is decreased here
+    fp_h() = f2;
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    Kokkos::deep_copy(fp_d, fp_h);
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    // assignment in kernel:
+    //  If on GPU: should not increase use_count of f1, and fp_d will not be
+    //  tracked.
+    //  If on Host: use_count of f1 will increase, fp_d is tracked, and
+    //  use_count of f2 goes down.
+    //  Since we are messing with the use count on the device, make the host
+    //  copy untracked first. Note that if fp_d and fp_h alias each other (e.g.
+    //  compiling for CPU only), fp_d() will be untracked too during assignment.
+    fp_h() = Foo();
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<ExecSpace>(0, 1),
+        KOKKOS_LAMBDA(int) { fp_d() = f1; });
+    Kokkos::fence();
+    Kokkos::deep_copy(fp_h, fp_d);
+
+    if (is_gpu)
+      ASSERT_EQ(1, f1.use_count());
+    else
+      ASSERT_EQ(2, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+
+    // Assign non-tracked ptr
+    //   if is_gpu: will not change use_count
+    //   if !is_gpu: will decrease use_count of f1
+    fp_h() = Foo();
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(1, f2.use_count());
+    fp_h() = f2;
+    ASSERT_EQ(1, f1.use_count());
+    ASSERT_EQ(2, f2.use_count());
+
+    // before deleting the host version, make sure it's not tracked
+    fp_h() = Foo();
+    if (fp_h_ptr != fp_d_ptr) Kokkos::kokkos_free<HostMemSpace>(fp_h_ptr);
+    Kokkos::kokkos_free<DevMemSpace>(fp_d_ptr);
+  }
+
+  ASSERT_EQ(1, f1.use_count());
+  ASSERT_EQ(1, f2.use_count());
+}
+}  // namespace
+
+TEST(TEST_CATEGORY, host_shared_ptr_tracking) {
+  host_shared_ptr_test_reference_counting<typename TEST_EXECSPACE::memory_space,
+                                          Kokkos::HostSpace>();
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value)
+    host_shared_ptr_test_reference_counting<Kokkos::CudaUVMSpace,
+                                            Kokkos::CudaUVMSpace>();
+#endif
+#ifdef KOKKOS_ENABLE_SYCL
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
+    host_shared_ptr_test_reference_counting<
+        Kokkos::Experimental::SYCLSharedUSMSpace,
+        Kokkos::Experimental::SYCLSharedUSMSpace>();
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+  if (std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
+    host_shared_ptr_test_reference_counting<
+        Kokkos::Experimental::HIPHostPinnedSpace,
+        Kokkos::Experimental::HIPHostPinnedSpace>();
+#endif
+}
+
+#endif  // KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
diff --git a/packages/kokkos/core/unit_test/TestInterOp.cpp b/packages/kokkos/core/unit_test/TestInterOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f08afada9d7c74e6d949949a6f52c2b7e3b7ada
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestInterOp.cpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <KokkosExp_InterOp.hpp>
+
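+// Each static_assert below pairs an input View/DynRankView type with the type
+// python_view_type_t is expected to produce: the same view with its layout
+// and memory space spelled out explicitly.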
+// View
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<double*>>,
+        Kokkos::View<
+            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: View");
+
+// DynRankView
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<double>>,
+        Kokkos::DynRankView<
+            double, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView");
+
+// View + Execution Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::View<double*, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::View<
+            double*, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: View + Execution Space");
+
+// DynRankView + Execution Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::DynRankView<double, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::DynRankView<
+            double, typename Kokkos::DefaultExecutionSpace::array_layout,
+            typename Kokkos::DefaultExecutionSpace::memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Execution Space");
+
+// View + Memory space
+static_assert(std::is_same<Kokkos::Experimental::python_view_type_t<
+                               Kokkos::View<int64_t*, Kokkos::HostSpace>>,
+                           Kokkos::View<int64_t*, Kokkos::LayoutRight,
+                                        Kokkos::HostSpace>>::value,
+              "Error! Unexpected python_view_type for: View + Memory space");
+
+// DynRankView + Memory space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<
+                     Kokkos::DynRankView<int16_t, Kokkos::HostSpace>>,
+                 Kokkos::DynRankView<int16_t, Kokkos::LayoutRight,
+                                     Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Memory space");
+
+// View + Layout + Execution space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<
+            int**, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>,
+        Kokkos::View<int**, Kokkos::LayoutLeft,
+                     typename Kokkos::DefaultExecutionSpace::memory_space>>::
+        value,
+    "Error! Unexpected python_view_type for: View + Layout + Execution space");
+
+// DynRankView + Layout + Execution space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+                     int, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>>,
+                 Kokkos::DynRankView<int, Kokkos::LayoutLeft,
+                                     typename Kokkos::DefaultExecutionSpace::
+                                         memory_space>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Execution "
+    "space");
+
+// View + Layout + Memory Space
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<
+            Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
+        Kokkos::View<uint32_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: View + Layout + Memory Space");
+
+// DynRankView + Layout + Memory Space
+static_assert(
+    std::is_same<Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+                     uint64_t, Kokkos::LayoutLeft, Kokkos::HostSpace>>,
+                 Kokkos::DynRankView<uint64_t, Kokkos::LayoutLeft,
+                                     Kokkos::HostSpace>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Memory "
+    "Space");
+
+// View + Layout + Execution space + Memory Trait
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::View<
+            float***, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace,
+            Kokkos::MemoryTraits<Kokkos::RandomAccess>>>,
+        Kokkos::View<float***, Kokkos::LayoutLeft,
+                     typename Kokkos::DefaultHostExecutionSpace::memory_space,
+                     Kokkos::MemoryTraits<Kokkos::RandomAccess>>>::value,
+    "Error! Unexpected python_view_type for: View + Layout + Execution space + "
+    "Memory Trait");
+
+// DynRankView + Layout + Execution space  + Memory trait
+static_assert(
+    std::is_same<
+        Kokkos::Experimental::python_view_type_t<Kokkos::DynRankView<
+            float, Kokkos::LayoutLeft, Kokkos::DefaultHostExecutionSpace,
+            Kokkos::MemoryTraits<Kokkos::Atomic>>>,
+        Kokkos::DynRankView<
+            float, Kokkos::LayoutLeft,
+            typename Kokkos::DefaultHostExecutionSpace::memory_space,
+            Kokkos::MemoryTraits<Kokkos::Atomic>>>::value,
+    "Error! Unexpected python_view_type for: DynRankView + Layout + Execution "
+    "space  + Memory trait");
diff --git a/packages/kokkos/core/unit_test/TestMDRange.hpp b/packages/kokkos/core/unit_test/TestMDRange.hpp
index 5618e40989b185a0233de2b20d6dec6636c9fe51..57461be714cde62bdd9b370c834759b91a13da92 100644
--- a/packages/kokkos/core/unit_test/TestMDRange.hpp
+++ b/packages/kokkos/core/unit_test/TestMDRange.hpp
@@ -2751,9 +2751,18 @@ struct TestMDRange_6D {
                            const int N3, const int N4, const int N5) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2772,9 +2781,18 @@ struct TestMDRange_6D {
 #endif
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2807,9 +2825,18 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 2}});
@@ -2832,9 +2859,18 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar + label
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
 
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
@@ -2858,9 +2894,19 @@ struct TestMDRange_6D {
 
     // Test with reducers - scalar view
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type =
+          typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
+                                         Kokkos::IndexType<int>,
+                                         Kokkos::LaunchBounds<512, 1>>;
+#endif
 #ifdef KOKKOS_ENABLE_SYCL
       range_type range({{0, 0, 0, 0, 0, 0}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 2}});
@@ -2888,9 +2934,18 @@ struct TestMDRange_6D {
     // Test Min reducer with lambda
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       range_type range({{1, 1, 1, 1, 1, 1}}, {{N0, N1, N2, N3, N4, N5}},
                        {{3, 3, 3, 2, 2, 1}});
 
@@ -2923,9 +2978,19 @@ struct TestMDRange_6D {
 
     // Tagged operator test
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
           Kokkos::IndexType<int>, InitTag>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
+          Kokkos::IndexType<int>, InitTag>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -2977,9 +3042,18 @@ struct TestMDRange_6D {
                         const int N4, const int N5) {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<128, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3028,8 +3102,16 @@ struct TestMDRange_6D {
 #endif
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>>;
+#endif
       using point_type = typename range_type::point_type;
 
       range_type range(point_type{{0, 0, 0, 0, 0, 0}},
@@ -3062,9 +3144,18 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>, InitTag>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>, InitTag>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3115,9 +3206,18 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3158,9 +3258,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Default, Iterate::Default>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3201,9 +3311,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Left>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Left, Iterate::Left>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3244,9 +3364,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Left, Iterate::Right>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Left, Iterate::Right>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3287,9 +3417,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Left>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Right, Iterate::Left>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3330,9 +3470,19 @@ struct TestMDRange_6D {
     }
 
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type = typename Kokkos::MDRangePolicy<
           ExecSpace, Kokkos::Rank<6, Iterate::Right, Iterate::Right>,
           Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<512, 1>,
+          Kokkos::Rank<6, Iterate::Right, Iterate::Right>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
@@ -3683,9 +3833,18 @@ struct TestMDRange_6D_NegIdx {
   static void test_6D_negidx(const int N0, const int N1, const int N2,
                              const int N3, const int N4, const int N5) {
     {
+#if defined(KOKKOS_COMPILER_INTEL)
+      // Launchbounds causes hang with intel compilers
       using range_type =
           typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<6>,
                                          Kokkos::IndexType<int>>;
+#else
+      // Launchbounds to ensure the tile fits into a CUDA block under register
+      // constraints
+      using range_type = typename Kokkos::MDRangePolicy<
+          ExecSpace, Kokkos::LaunchBounds<256, 1>, Kokkos::Rank<6>,
+          Kokkos::IndexType<int>>;
+#endif
       using tile_type  = typename range_type::tile_type;
       using point_type = typename range_type::point_type;
 
diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
index 777f91aea3e560981d5dde05767f1726d8a1542f..b38871afaaf6a277f6080e34f1a81aac31f6fb93 100644
--- a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp
@@ -601,7 +601,8 @@ TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.f, 3.f);
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2., 3.);
 #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
-#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))  // FIXME
+// FIXME: fails with gcc on Power platforms
+#if !(defined(KOKKOS_ARCH_POWER8) || defined(KOKKOS_ARCH_POWER9))
   do_test_math_binary_function<TEST_EXECSPACE, kk_hypot>(2.l, 3.l);
 #endif
 #endif
@@ -668,7 +669,13 @@ TEST(TEST_CATEGORY, mathematical_functions_exponential_functions) {
   TEST_MATH_FUNCTION(log10)({1234.l, 567.l, 89.l, .003l});
 #endif
 
+// FIXME_OPENMPTARGET FIXME_AMD
+#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) &&                          \
+      (defined(KOKKOS_ARCH_VEGA906) || defined(KOKKOS_ARCH_VEGA908) || \
+       defined(KOKKOS_ARCH_VEGA90A)))
+
   TEST_MATH_FUNCTION(log2)({1, 23, 456, 7890});
+#endif
   TEST_MATH_FUNCTION(log2)({1l, 23l, 456l, 7890l});
   TEST_MATH_FUNCTION(log2)({1ll, 23ll, 456ll, 7890ll});
   TEST_MATH_FUNCTION(log2)({1u, 23u, 456u, 7890u});
@@ -869,3 +876,69 @@ TEST(TEST_CATEGORY,
 #endif
 #endif
 }
+
+template <class Space>
+struct TestAbsoluteValueFunction {
+  TestAbsoluteValueFunction() { run(); }
+  void run() const {
+    int errors = 0;
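+    // The functor is passed to parallel_reduce as its own kernel: each failed
+    // check below increments the error count, which must come back as zero.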
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<Space>(0, 1), *this, errors);
+    ASSERT_EQ(errors, 0);
+  }
+  KOKKOS_FUNCTION void operator()(int, int& e) const {
+    using Kokkos::Experimental::abs;
+    if (abs(1) != 1 || abs(-1) != 1) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(int)\n");
+    }
+    if (abs(2l) != 2l || abs(-2l) != 2l) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long int)\n");
+    }
+    if (abs(3ll) != 3ll || abs(-3ll) != 3ll) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long long int)\n");
+    }
+    if (abs(4.f) != 4.f || abs(-4.f) != 4.f) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(float)\n");
+    }
+    if (abs(5.) != 5. || abs(-5.) != 5.) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(double)\n");
+    }
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    if (abs(6.l) != 6.l || abs(-6.l) != 6.l) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(long double)\n");
+    }
+#endif
+    // special values
+    using Kokkos::Experimental::isinf;
+    using Kokkos::Experimental::isnan;
+    if (abs(-0.) != 0.
+    // WORKAROUND icpx changing default FP model when optimization level is >= 1
+    // using -fp-model=precise works too
+#ifndef __INTEL_LLVM_COMPILER
+        || !isinf(abs(-INFINITY)) || !isnan(abs(-NAN))
+#endif
+    ) {
+      ++e;
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+          "failed abs(floating_point) special values\n");
+    }
+
+    static_assert(std::is_same<decltype(abs(1)), int>::value, "");
+    static_assert(std::is_same<decltype(abs(2l)), long>::value, "");
+    static_assert(std::is_same<decltype(abs(3ll)), long long>::value, "");
+    static_assert(std::is_same<decltype(abs(4.f)), float>::value, "");
+    static_assert(std::is_same<decltype(abs(5.)), double>::value, "");
+#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
+    static_assert(std::is_same<decltype(abs(6.l)), long double>::value, "");
+#endif
+  }
+};
+
+TEST(TEST_CATEGORY, mathematical_functions_absolute_value) {
+  TestAbsoluteValueFunction<TEST_EXECSPACE>();
+}
diff --git a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d9b4db6bdef50c48a7010d907fb9abf02e05c35
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp
@@ -0,0 +1,1895 @@
+#include <fstream>
+#include <gtest/gtest.h>
+#include "Kokkos_Core.hpp"
+
+namespace Test {
+
+struct TestLargeArgTag {};
+struct TestRealErfcxTag {};
+
+template <class ExecSpace>
+struct TestExponentialIntergral1Function {
+  using ViewType     = Kokkos::View<double*, ExecSpace>;
+  using HostViewType = Kokkos::View<double*, Kokkos::HostSpace>;
+
+  ViewType d_x, d_expint;
+  typename ViewType::HostMirror h_x, h_expint;
+  HostViewType h_ref;
+
+  void testit() {
+    using Kokkos::Experimental::fabs;
+    using Kokkos::Experimental::infinity;
+
+    d_x      = ViewType("d_x", 15);
+    d_expint = ViewType("d_expint", 15);
+    h_x      = Kokkos::create_mirror_view(d_x);
+    h_expint = Kokkos::create_mirror_view(d_expint);
+    h_ref    = HostViewType("h_ref", 15);
+
+    // Generate test inputs
+    h_x(0)  = -0.2;
+    h_x(1)  = 0.0;
+    h_x(2)  = 0.2;
+    h_x(3)  = 0.8;
+    h_x(4)  = 1.6;
+    h_x(5)  = 5.1;
+    h_x(6)  = 0.01;
+    h_x(7)  = 0.001;
+    h_x(8)  = 1.0;
+    h_x(9)  = 1.001;
+    h_x(10) = 1.01;
+    h_x(11) = 1.1;
+    h_x(12) = 7.2;
+    h_x(13) = 10.3;
+    h_x(14) = 15.4;
+    Kokkos::deep_copy(d_x, h_x);
+
+    // Call exponential integral function
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 15), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_expint, d_expint);
+
+    // Reference values computed with Octave
+    h_ref(0)  = -infinity<double>::value;  // x(0)=-0.2
+    h_ref(1)  = infinity<double>::value;   // x(1)= 0.0
+    h_ref(2)  = 1.222650544183893e+00;     // x(2) =0.2
+    h_ref(3)  = 3.105965785455429e-01;     // x(3) =0.8
+    h_ref(4)  = 8.630833369753976e-02;     // x(4) =1.6
+    h_ref(5)  = 1.021300107861738e-03;     // x(5) =5.1
+    h_ref(6)  = 4.037929576538113e+00;     // x(6) =0.01
+    h_ref(7)  = 6.331539364136149e+00;     // x(7) =0.001
+    h_ref(8)  = 2.193839343955205e-01;     // x(8) =1.0
+    h_ref(9)  = 2.190164225274689e-01;     // x(9) =1.001
+    h_ref(10) = 2.157416237944899e-01;     // x(10)=1.01
+    h_ref(11) = 1.859909045360401e-01;     // x(11)=1.1
+    h_ref(12) = 9.218811688716196e-05;     // x(12)=7.2
+    h_ref(13) = 2.996734771597901e-06;     // x(13)=10.3
+    h_ref(14) = 1.254522935050609e-08;     // x(14)=15.4
+
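+    // The two non-positive inputs map exactly to +/-infinity; the remaining
+    // finite values are compared against the references with a 1e-15 relative
+    // tolerance.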
+    EXPECT_EQ(h_ref(0), h_expint(0));
+    EXPECT_EQ(h_ref(1), h_expint(1));
+    for (int i = 2; i < 15; i++) {
+      EXPECT_LE(std::abs(h_expint(i) - h_ref(i)), std::abs(h_ref(i)) * 1e-15);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_expint(i) = Kokkos::Experimental::expint1(d_x(i));
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexErrorFunction {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+  using DblViewType     = Kokkos::View<double*, ExecSpace>;
+  using DblHostViewType = Kokkos::View<double*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_erf, d_erfcx;
+  typename ViewType::HostMirror h_z, h_erf, h_erfcx;
+  HostViewType h_ref_erf, h_ref_erfcx;
+
+  DblViewType d_x, d_erfcx_dbl;
+  typename DblViewType::HostMirror h_x, h_erfcx_dbl;
+  DblHostViewType h_ref_erfcx_dbl;
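+  // The real-valued erfcx overload is exercised separately through the
+  // TestRealErfcxTag operator on a small set of inputs that includes
+  // +/-infinity.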
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    d_z         = ViewType("d_z", 52);
+    d_erf       = ViewType("d_erf", 52);
+    d_erfcx     = ViewType("d_erfcx", 52);
+    h_z         = Kokkos::create_mirror_view(d_z);
+    h_erf       = Kokkos::create_mirror_view(d_erf);
+    h_erfcx     = Kokkos::create_mirror_view(d_erfcx);
+    h_ref_erf   = HostViewType("h_ref_erf", 52);
+    h_ref_erfcx = HostViewType("h_ref_erfcx", 52);
+
+    d_x             = DblViewType("d_x", 6);
+    d_erfcx_dbl     = DblViewType("d_erfcx_dbl", 6);
+    h_x             = Kokkos::create_mirror_view(d_x);
+    h_erfcx_dbl     = Kokkos::create_mirror_view(d_erfcx_dbl);
+    h_ref_erfcx_dbl = DblHostViewType("h_ref_erfcx_dbl", 6);
+
+    // Generate test inputs
+    // abs(z)<=2
+    h_z(0)  = Kokkos::complex<double>(0.0011, 0);
+    h_z(1)  = Kokkos::complex<double>(-0.0011, 0);
+    h_z(2)  = Kokkos::complex<double>(1.4567, 0);
+    h_z(3)  = Kokkos::complex<double>(-1.4567, 0);
+    h_z(4)  = Kokkos::complex<double>(0, 0.0011);
+    h_z(5)  = Kokkos::complex<double>(0, -0.0011);
+    h_z(6)  = Kokkos::complex<double>(0, 1.4567);
+    h_z(7)  = Kokkos::complex<double>(0, -1.4567);
+    h_z(8)  = Kokkos::complex<double>(1.4567, 0.0011);
+    h_z(9)  = Kokkos::complex<double>(1.4567, -0.0011);
+    h_z(10) = Kokkos::complex<double>(-1.4567, 0.0011);
+    h_z(11) = Kokkos::complex<double>(-1.4567, -0.0011);
+    h_z(12) = Kokkos::complex<double>(1.4567, 0.5942);
+    h_z(13) = Kokkos::complex<double>(1.4567, -0.5942);
+    h_z(14) = Kokkos::complex<double>(-1.4567, 0.5942);
+    h_z(15) = Kokkos::complex<double>(-1.4567, -0.5942);
+    h_z(16) = Kokkos::complex<double>(0.0011, 0.5942);
+    h_z(17) = Kokkos::complex<double>(0.0011, -0.5942);
+    h_z(18) = Kokkos::complex<double>(-0.0011, 0.5942);
+    h_z(19) = Kokkos::complex<double>(-0.0011, -0.5942);
+    h_z(20) = Kokkos::complex<double>(0.0011, 0.0051);
+    h_z(21) = Kokkos::complex<double>(0.0011, -0.0051);
+    h_z(22) = Kokkos::complex<double>(-0.0011, 0.0051);
+    h_z(23) = Kokkos::complex<double>(-0.0011, -0.0051);
+    // abs(z)>2.0 and x>1
+    h_z(24) = Kokkos::complex<double>(3.5, 0.0011);
+    h_z(25) = Kokkos::complex<double>(3.5, -0.0011);
+    h_z(26) = Kokkos::complex<double>(-3.5, 0.0011);
+    h_z(27) = Kokkos::complex<double>(-3.5, -0.0011);
+    h_z(28) = Kokkos::complex<double>(3.5, 9.7);
+    h_z(29) = Kokkos::complex<double>(3.5, -9.7);
+    h_z(30) = Kokkos::complex<double>(-3.5, 9.7);
+    h_z(31) = Kokkos::complex<double>(-3.5, -9.7);
+    h_z(32) = Kokkos::complex<double>(18.9, 9.7);
+    h_z(33) = Kokkos::complex<double>(18.9, -9.7);
+    h_z(34) = Kokkos::complex<double>(-18.9, 9.7);
+    h_z(35) = Kokkos::complex<double>(-18.9, -9.7);
+    // abs(z)>2.0 and 0<=x<=1 and abs(y)<6
+    h_z(36) = Kokkos::complex<double>(0.85, 3.5);
+    h_z(37) = Kokkos::complex<double>(0.85, -3.5);
+    h_z(38) = Kokkos::complex<double>(-0.85, 3.5);
+    h_z(39) = Kokkos::complex<double>(-0.85, -3.5);
+    h_z(40) = Kokkos::complex<double>(0.0011, 3.5);
+    h_z(41) = Kokkos::complex<double>(0.0011, -3.5);
+    h_z(42) = Kokkos::complex<double>(-0.0011, 3.5);
+    h_z(43) = Kokkos::complex<double>(-0.0011, -3.5);
+    // abs(z)>2.0 and 0<=x<=1 and abs(y)>=6
+    h_z(44) = Kokkos::complex<double>(0.85, 7.5);
+    h_z(45) = Kokkos::complex<double>(0.85, -7.5);
+    h_z(46) = Kokkos::complex<double>(-0.85, 7.5);
+    h_z(47) = Kokkos::complex<double>(-0.85, -7.5);
+    h_z(48) = Kokkos::complex<double>(0.85, 19.7);
+    h_z(49) = Kokkos::complex<double>(0.85, -19.7);
+    h_z(50) = Kokkos::complex<double>(-0.85, 19.7);
+    h_z(51) = Kokkos::complex<double>(-0.85, -19.7);
+
+    h_x(0) = -infinity<double>::value;
+    h_x(1) = -1.2;
+    h_x(2) = 0.0;
+    h_x(3) = 1.2;
+    h_x(4) = 10.5;
+    h_x(5) = infinity<double>::value;
+
+    Kokkos::deep_copy(d_z, h_z);
+    Kokkos::deep_copy(d_x, h_x);
+
+    // Call erf and erfcx functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, 52), *this);
+    Kokkos::fence();
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestRealErfcxTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_erf, d_erf);
+    Kokkos::deep_copy(h_erfcx, d_erfcx);
+    Kokkos::deep_copy(h_erfcx_dbl, d_erfcx_dbl);
+
+    // Reference values computed with Octave
+    h_ref_erf(0) = Kokkos::complex<double>(0.001241216583181022, 0);
+    h_ref_erf(1) = Kokkos::complex<double>(-0.001241216583181022, 0);
+    h_ref_erf(2) = Kokkos::complex<double>(0.9606095744865353, 0);
+    h_ref_erf(3) = Kokkos::complex<double>(-0.9606095744865353, 0);
+    h_ref_erf(4) = Kokkos::complex<double>(0, 0.001241217584429469);
+    h_ref_erf(5) = Kokkos::complex<double>(0, -0.001241217584429469);
+    h_ref_erf(6) = Kokkos::complex<double>(0, 4.149756424218223);
+    h_ref_erf(7) = Kokkos::complex<double>(0, -4.149756424218223);
+    h_ref_erf(8) =
+        Kokkos::complex<double>(0.960609812745064, 0.0001486911741082233);
+    h_ref_erf(9) =
+        Kokkos::complex<double>(0.960609812745064, -0.0001486911741082233);
+    h_ref_erf(10) =
+        Kokkos::complex<double>(-0.960609812745064, 0.0001486911741082233);
+    h_ref_erf(11) =
+        Kokkos::complex<double>(-0.960609812745064, -0.0001486911741082233);
+    h_ref_erf(12) =
+        Kokkos::complex<double>(1.02408827958197, 0.04828570635603527);
+    h_ref_erf(13) =
+        Kokkos::complex<double>(1.02408827958197, -0.04828570635603527);
+    h_ref_erf(14) =
+        Kokkos::complex<double>(-1.02408827958197, 0.04828570635603527);
+    h_ref_erf(15) =
+        Kokkos::complex<double>(-1.02408827958197, -0.04828570635603527);
+    h_ref_erf(16) =
+        Kokkos::complex<double>(0.001766791817179109, 0.7585038120712589);
+    h_ref_erf(17) =
+        Kokkos::complex<double>(0.001766791817179109, -0.7585038120712589);
+    h_ref_erf(18) =
+        Kokkos::complex<double>(-0.001766791817179109, 0.7585038120712589);
+    h_ref_erf(19) =
+        Kokkos::complex<double>(-0.001766791817179109, -0.7585038120712589);
+    h_ref_erf(20) =
+        Kokkos::complex<double>(0.001241248867618165, 0.005754776682713324);
+    h_ref_erf(21) =
+        Kokkos::complex<double>(0.001241248867618165, -0.005754776682713324);
+    h_ref_erf(22) =
+        Kokkos::complex<double>(-0.001241248867618165, 0.005754776682713324);
+    h_ref_erf(23) =
+        Kokkos::complex<double>(-0.001241248867618165, -0.005754776682713324);
+    h_ref_erf(24) =
+        Kokkos::complex<double>(0.9999992569244941, 5.939313159932013e-09);
+    h_ref_erf(25) =
+        Kokkos::complex<double>(0.9999992569244941, -5.939313159932013e-09);
+    h_ref_erf(26) =
+        Kokkos::complex<double>(-0.9999992569244941, 5.939313159932013e-09);
+    h_ref_erf(27) =
+        Kokkos::complex<double>(-0.9999992569244941, -5.939313159932013e-09);
+    h_ref_erf(28) =
+        Kokkos::complex<double>(-1.915595842013002e+34, 1.228821279117683e+32);
+    h_ref_erf(29) =
+        Kokkos::complex<double>(-1.915595842013002e+34, -1.228821279117683e+32);
+    h_ref_erf(30) =
+        Kokkos::complex<double>(1.915595842013002e+34, 1.228821279117683e+32);
+    h_ref_erf(31) =
+        Kokkos::complex<double>(1.915595842013002e+34, -1.228821279117683e+32);
+    h_ref_erf(32) = Kokkos::complex<double>(1, 5.959897539826596e-117);
+    h_ref_erf(33) = Kokkos::complex<double>(1, -5.959897539826596e-117);
+    h_ref_erf(34) = Kokkos::complex<double>(-1, 5.959897539826596e-117);
+    h_ref_erf(35) = Kokkos::complex<double>(-1, -5.959897539826596e-117);
+    h_ref_erf(36) =
+        Kokkos::complex<double>(-9211.077162784413, 13667.93825589455);
+    h_ref_erf(37) =
+        Kokkos::complex<double>(-9211.077162784413, -13667.93825589455);
+    h_ref_erf(38) =
+        Kokkos::complex<double>(9211.077162784413, 13667.93825589455);
+    h_ref_erf(39) =
+        Kokkos::complex<double>(9211.077162784413, -13667.93825589455);
+    h_ref_erf(40) = Kokkos::complex<double>(259.38847811225, 35281.28906479814);
+    h_ref_erf(41) =
+        Kokkos::complex<double>(259.38847811225, -35281.28906479814);
+    h_ref_erf(42) =
+        Kokkos::complex<double>(-259.38847811225, 35281.28906479814);
+    h_ref_erf(43) =
+        Kokkos::complex<double>(-259.38847811225, -35281.28906479814);
+    h_ref_erf(44) =
+        Kokkos::complex<double>(6.752085728270252e+21, 9.809477366939276e+22);
+    h_ref_erf(45) =
+        Kokkos::complex<double>(6.752085728270252e+21, -9.809477366939276e+22);
+    h_ref_erf(46) =
+        Kokkos::complex<double>(-6.752085728270252e+21, 9.809477366939276e+22);
+    h_ref_erf(47) =
+        Kokkos::complex<double>(-6.752085728270252e+21, -9.809477366939276e+22);
+    h_ref_erf(48) =
+        Kokkos::complex<double>(4.37526734926942e+166, -2.16796709605852e+166);
+    h_ref_erf(49) =
+        Kokkos::complex<double>(4.37526734926942e+166, 2.16796709605852e+166);
+    h_ref_erf(50) =
+        Kokkos::complex<double>(-4.37526734926942e+166, -2.16796709605852e+166);
+    h_ref_erf(51) =
+        Kokkos::complex<double>(-4.37526734926942e+166, 2.16796709605852e+166);
+
+    h_ref_erfcx(0) = Kokkos::complex<double>(0.9987599919156778, 0);
+    h_ref_erfcx(1) = Kokkos::complex<double>(1.001242428085786, 0);
+    h_ref_erfcx(2) = Kokkos::complex<double>(0.3288157848563544, 0);
+    h_ref_erfcx(3) = Kokkos::complex<double>(16.36639786516915, 0);
+    h_ref_erfcx(4) =
+        Kokkos::complex<double>(0.999998790000732, -0.001241216082557101);
+    h_ref_erfcx(5) =
+        Kokkos::complex<double>(0.999998790000732, 0.001241216082557101);
+    h_ref_erfcx(6) =
+        Kokkos::complex<double>(0.1197948131677216, -0.4971192955307743);
+    h_ref_erfcx(7) =
+        Kokkos::complex<double>(0.1197948131677216, 0.4971192955307743);
+    h_ref_erfcx(8) =
+        Kokkos::complex<double>(0.3288156873503045, -0.0001874479383970247);
+    h_ref_erfcx(9) =
+        Kokkos::complex<double>(0.3288156873503045, 0.0001874479383970247);
+    h_ref_erfcx(10) =
+        Kokkos::complex<double>(16.36629202874158, -0.05369111060785572);
+    h_ref_erfcx(11) =
+        Kokkos::complex<double>(16.36629202874158, 0.05369111060785572);
+    h_ref_erfcx(12) =
+        Kokkos::complex<double>(0.3020886508118801, -0.09424097887578842);
+    h_ref_erfcx(13) =
+        Kokkos::complex<double>(0.3020886508118801, 0.09424097887578842);
+    h_ref_erfcx(14) =
+        Kokkos::complex<double>(-2.174707722732267, -11.67259764091796);
+    h_ref_erfcx(15) =
+        Kokkos::complex<double>(-2.174707722732267, 11.67259764091796);
+    h_ref_erfcx(16) =
+        Kokkos::complex<double>(0.7019810779371267, -0.5319516793968513);
+    h_ref_erfcx(17) =
+        Kokkos::complex<double>(0.7019810779371267, 0.5319516793968513);
+    h_ref_erfcx(18) =
+        Kokkos::complex<double>(0.7030703366403597, -0.5337884198542978);
+    h_ref_erfcx(19) =
+        Kokkos::complex<double>(0.7030703366403597, 0.5337884198542978);
+    h_ref_erfcx(20) =
+        Kokkos::complex<double>(0.9987340467266177, -0.005743428170378673);
+    h_ref_erfcx(21) =
+        Kokkos::complex<double>(0.9987340467266177, 0.005743428170378673);
+    h_ref_erfcx(22) =
+        Kokkos::complex<double>(1.001216353762532, -0.005765867613873103);
+    h_ref_erfcx(23) =
+        Kokkos::complex<double>(1.001216353762532, 0.005765867613873103);
+    h_ref_erfcx(24) =
+        Kokkos::complex<double>(0.1552936427089241, -4.545593205871305e-05);
+    h_ref_erfcx(25) =
+        Kokkos::complex<double>(0.1552936427089241, 4.545593205871305e-05);
+    h_ref_erfcx(26) =
+        Kokkos::complex<double>(417949.5262869648, -3218.276197742372);
+    h_ref_erfcx(27) =
+        Kokkos::complex<double>(417949.5262869648, 3218.276197742372);
+    h_ref_erfcx(28) =
+        Kokkos::complex<double>(0.01879467905925653, -0.0515934271478583);
+    h_ref_erfcx(29) =
+        Kokkos::complex<double>(0.01879467905925653, 0.0515934271478583);
+    h_ref_erfcx(30) =
+        Kokkos::complex<double>(-0.01879467905925653, -0.0515934271478583);
+    h_ref_erfcx(31) =
+        Kokkos::complex<double>(-0.01879467905925653, 0.0515934271478583);
+    h_ref_erfcx(32) =
+        Kokkos::complex<double>(0.02362328821805, -0.01209735551897239);
+    h_ref_erfcx(33) =
+        Kokkos::complex<double>(0.02362328821805, 0.01209735551897239);
+    h_ref_erfcx(34) = Kokkos::complex<double>(-2.304726099084567e+114,
+                                              -2.942443198107089e+114);
+    h_ref_erfcx(35) = Kokkos::complex<double>(-2.304726099084567e+114,
+                                              2.942443198107089e+114);
+    h_ref_erfcx(36) =
+        Kokkos::complex<double>(0.04174017523145063, -0.1569865319886248);
+    h_ref_erfcx(37) =
+        Kokkos::complex<double>(0.04174017523145063, 0.1569865319886248);
+    h_ref_erfcx(38) =
+        Kokkos::complex<double>(-0.04172154858670504, -0.156980085534407);
+    h_ref_erfcx(39) =
+        Kokkos::complex<double>(-0.04172154858670504, 0.156980085534407);
+    h_ref_erfcx(40) =
+        Kokkos::complex<double>(6.355803055239174e-05, -0.1688298297427782);
+    h_ref_erfcx(41) =
+        Kokkos::complex<double>(6.355803055239174e-05, 0.1688298297427782);
+    h_ref_erfcx(42) =
+        Kokkos::complex<double>(-5.398806789669434e-05, -0.168829903432947);
+    h_ref_erfcx(43) =
+        Kokkos::complex<double>(-5.398806789669434e-05, 0.168829903432947);
+    h_ref_erfcx(44) =
+        Kokkos::complex<double>(0.008645103282302355, -0.07490521021566741);
+    h_ref_erfcx(45) =
+        Kokkos::complex<double>(0.008645103282302355, 0.07490521021566741);
+    h_ref_erfcx(46) =
+        Kokkos::complex<double>(-0.008645103282302355, -0.07490521021566741);
+    h_ref_erfcx(47) =
+        Kokkos::complex<double>(-0.008645103282302355, 0.07490521021566741);
+    h_ref_erfcx(48) =
+        Kokkos::complex<double>(0.001238176693606428, -0.02862247416909219);
+    h_ref_erfcx(49) =
+        Kokkos::complex<double>(0.001238176693606428, 0.02862247416909219);
+    h_ref_erfcx(50) =
+        Kokkos::complex<double>(-0.001238176693606428, -0.02862247416909219);
+    h_ref_erfcx(51) =
+        Kokkos::complex<double>(-0.001238176693606428, 0.02862247416909219);
+
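+    // Reference values for erfcx evaluated at the real (double) inputs d_x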
+    h_ref_erfcx_dbl(0) = infinity<double>::value;
+    h_ref_erfcx_dbl(1) = 8.062854217063865e+00;
+    h_ref_erfcx_dbl(2) = 1.0;
+    h_ref_erfcx_dbl(3) = 3.785374169292397e-01;
+    h_ref_erfcx_dbl(4) = 5.349189974656411e-02;
+    h_ref_erfcx_dbl(5) = 0.0;
+
+    for (int i = 0; i < 52; i++) {
+      EXPECT_LE(Kokkos::abs(h_erf(i) - h_ref_erf(i)),
+                Kokkos::abs(h_ref_erf(i)) * 1e-13);
+    }
+
+    for (int i = 0; i < 52; i++) {
+      EXPECT_LE(Kokkos::abs(h_erfcx(i) - h_ref_erfcx(i)),
+                Kokkos::abs(h_ref_erfcx(i)) * 1e-13);
+    }
+
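+    // The references at indices 0 (infinite) and 5 (exactly zero) are
+    // compared exactly, since a relative tolerance would be meaningless
+    // there; the remaining entries use a 1e-13 relative tolerance.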
+    EXPECT_EQ(h_erfcx_dbl(0), h_ref_erfcx_dbl(0));
+    EXPECT_EQ(h_erfcx_dbl(5), h_ref_erfcx_dbl(5));
+    for (int i = 1; i < 5; i++) {
+      EXPECT_LE(std::abs(h_erfcx_dbl(i) - h_ref_erfcx_dbl(i)),
+                std::abs(h_ref_erfcx_dbl(i)) * 1e-13);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_erf(i)   = Kokkos::Experimental::erf(d_z(i));
+    d_erfcx(i) = Kokkos::Experimental::erfcx(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestRealErfcxTag&, const int& /*i*/) const {
+    d_erfcx_dbl(0) = Kokkos::Experimental::erfcx(d_x(0));
+    d_erfcx_dbl(1) = Kokkos::Experimental::erfcx(d_x(1));
+    d_erfcx_dbl(2) = Kokkos::Experimental::erfcx(d_x(2));
+    d_erfcx_dbl(3) = Kokkos::Experimental::erfcx(d_x(3));
+    d_erfcx_dbl(4) = Kokkos::Experimental::erfcx(d_x(4));
+    d_erfcx_dbl(5) = Kokkos::Experimental::erfcx(d_x(5));
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselJ0Y0Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbj0, d_cby0;
+  typename ViewType::HostMirror h_z, h_cbj0, h_cby0;
+  HostViewType h_ref_cbj0, h_ref_cby0;
+
+  ViewType d_z_large, d_cbj0_large, d_cby0_large;
+  typename ViewType::HostMirror h_z_large, h_cbj0_large, h_cby0_large;
+  HostViewType h_ref_cbj0_large, h_ref_cby0_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbj0     = ViewType("d_cbj0", N);
+    d_cby0     = ViewType("d_cby0", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbj0     = Kokkos::create_mirror_view(d_cbj0);
+    h_cby0     = Kokkos::create_mirror_view(d_cby0);
+    h_ref_cbj0 = HostViewType("h_ref_cbj0", N);
+    h_ref_cby0 = HostViewType("h_ref_cby0", N);
+
+    // Generate test inputs
+    h_z(0) = Kokkos::complex<double>(0.0, 0.0);
+    // abs(z)<=25
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    // abs(z)>25
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj0, d_cbj0);
+    Kokkos::deep_copy(h_cby0, d_cby0);
+
+    // Reference values computed with Octave
+    h_ref_cbj0(0) = Kokkos::complex<double>(1.000000000000000e+00, 0);
+    h_ref_cbj0(1) =
+        Kokkos::complex<double>(-1.249234879607422e+00, -9.479837920577351e-01);
+    h_ref_cbj0(2) =
+        Kokkos::complex<double>(-1.249234879607422e+00, +9.479837920577351e-01);
+    h_ref_cbj0(3) =
+        Kokkos::complex<double>(-1.249234879607422e+00, +9.479837920577351e-01);
+    h_ref_cbj0(4) =
+        Kokkos::complex<double>(-1.249234879607422e+00, -9.479837920577351e-01);
+    h_ref_cbj0(5) =
+        Kokkos::complex<double>(-1.602439981218195e+03, +7.230667451989807e+02);
+    h_ref_cbj0(6) =
+        Kokkos::complex<double>(-1.602439981218195e+03, -7.230667451989807e+02);
+    h_ref_cbj0(7) =
+        Kokkos::complex<double>(-1.602439981218195e+03, -7.230667451989807e+02);
+    h_ref_cbj0(8) =
+        Kokkos::complex<double>(-1.602439981218195e+03, +7.230667451989807e+02);
+    h_ref_cbj0(9) = Kokkos::complex<double>(-2.600519549019335e-01, 0);
+    h_ref_cbj0(10) =
+        Kokkos::complex<double>(-2.600519549019335e-01, +9.951051106466461e-18);
+    h_ref_cbj0(11) = Kokkos::complex<double>(-1.624127813134866e-01, 0);
+    h_ref_cbj0(12) =
+        Kokkos::complex<double>(-1.624127813134866e-01, -1.387778780781446e-17);
+    h_ref_cbj0(13) =
+        Kokkos::complex<double>(-1.012912188513958e+03, -1.256239636146142e+03);
+    h_ref_cbj0(14) =
+        Kokkos::complex<double>(-1.012912188513958e+03, +1.256239636146142e+03);
+    h_ref_cbj0(15) =
+        Kokkos::complex<double>(-1.012912188513958e+03, +1.256239636146142e+03);
+    h_ref_cbj0(16) =
+        Kokkos::complex<double>(-1.012912188513958e+03, -1.256239636146142e+03);
+    h_ref_cbj0(17) =
+        Kokkos::complex<double>(-1.040215134669324e+03, -4.338202386810095e+02);
+    h_ref_cbj0(18) =
+        Kokkos::complex<double>(-1.040215134669324e+03, +4.338202386810095e+02);
+    h_ref_cbj0(19) =
+        Kokkos::complex<double>(-1.040215134669324e+03, +4.338202386810095e+02);
+    h_ref_cbj0(20) =
+        Kokkos::complex<double>(-1.040215134669324e+03, -4.338202386810095e+02);
+    h_ref_cbj0(21) = Kokkos::complex<double>(-7.315701054899962e-02, 0);
+    h_ref_cbj0(22) =
+        Kokkos::complex<double>(-7.315701054899962e-02, -6.938893903907228e-18);
+    h_ref_cbj0(23) = Kokkos::complex<double>(-9.147180408906189e-02, 0);
+    h_ref_cbj0(24) =
+        Kokkos::complex<double>(-9.147180408906189e-02, +1.387778780781446e-17);
+
+    h_ref_cby0(0) = Kokkos::complex<double>(-infinity<double>::value, 0);
+    h_ref_cby0(1) =
+        Kokkos::complex<double>(1.000803196554890e+00, -1.231441609303427e+00);
+    h_ref_cby0(2) =
+        Kokkos::complex<double>(1.000803196554890e+00, +1.231441609303427e+00);
+    h_ref_cby0(3) =
+        Kokkos::complex<double>(-8.951643875605797e-01, -1.267028149911417e+00);
+    h_ref_cby0(4) =
+        Kokkos::complex<double>(-8.951643875605797e-01, +1.267028149911417e+00);
+    h_ref_cby0(5) =
+        Kokkos::complex<double>(-7.230667452992603e+02, -1.602439974000479e+03);
+    h_ref_cby0(6) =
+        Kokkos::complex<double>(-7.230667452992603e+02, +1.602439974000479e+03);
+    h_ref_cby0(7) =
+        Kokkos::complex<double>(7.230667450987011e+02, -1.602439988435912e+03);
+    h_ref_cby0(8) =
+        Kokkos::complex<double>(7.230667450987011e+02, +1.602439988435912e+03);
+    h_ref_cby0(9) = Kokkos::complex<double>(3.768500100127903e-01, 0);
+    h_ref_cby0(10) =
+        Kokkos::complex<double>(3.768500100127903e-01, -5.201039098038670e-01);
+    h_ref_cby0(11) = Kokkos::complex<double>(-3.598179027370283e-02, 0);
+    h_ref_cby0(12) =
+        Kokkos::complex<double>(-3.598179027370282e-02, -3.248255626269732e-01);
+    h_ref_cby0(13) =
+        Kokkos::complex<double>(1.256239642409530e+03, -1.012912186329053e+03);
+    h_ref_cby0(14) =
+        Kokkos::complex<double>(1.256239642409530e+03, +1.012912186329053e+03);
+    h_ref_cby0(15) =
+        Kokkos::complex<double>(-1.256239629882755e+03, -1.012912190698863e+03);
+    h_ref_cby0(16) =
+        Kokkos::complex<double>(-1.256239629882755e+03, +1.012912190698863e+03);
+    h_ref_cby0(17) =
+        Kokkos::complex<double>(4.338202411482646e+02, -1.040215130736213e+03);
+    h_ref_cby0(18) =
+        Kokkos::complex<double>(4.338202411482646e+02, +1.040215130736213e+03);
+    h_ref_cby0(19) =
+        Kokkos::complex<double>(-4.338202362137545e+02, -1.040215138602435e+03);
+    h_ref_cby0(20) =
+        Kokkos::complex<double>(-4.338202362137545e+02, +1.040215138602435e+03);
+    h_ref_cby0(21) = Kokkos::complex<double>(1.318364704235323e-01, 0);
+    h_ref_cby0(22) =
+        Kokkos::complex<double>(1.318364704235323e-01, -1.463140210979992e-01);
+    h_ref_cby0(23) = Kokkos::complex<double>(4.735895220944939e-02, 0);
+    h_ref_cby0(24) =
+        Kokkos::complex<double>(4.735895220944938e-02, -1.829436081781237e-01);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbj0(i) - h_ref_cbj0(i)),
+                Kokkos::abs(h_ref_cbj0(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cby0(0), h_cby0(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cby0(i) - h_ref_cby0(i)),
+                Kokkos::abs(h_ref_cby0(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbj0_large     = ViewType("d_cbj0_large", 6);
+    d_cby0_large     = ViewType("d_cby0_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbj0_large     = Kokkos::create_mirror_view(d_cbj0_large);
+    h_cby0_large     = Kokkos::create_mirror_view(d_cby0_large);
+    h_ref_cbj0_large = HostViewType("h_ref_cbj0_large", 2);
+    h_ref_cby0_large = HostViewType("h_ref_cby0_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(1) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(2) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(3) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(4) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(5) = Kokkos::complex<double>(-10000.0, 100.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj0_large, d_cbj0_large);
+    Kokkos::deep_copy(h_cby0_large, d_cby0_large);
+
+    h_ref_cbj0_large(0) =
+        Kokkos::complex<double>(-9.561811498244175e+40, -4.854995782103029e+40);
+    h_ref_cbj0_large(1) =
+        Kokkos::complex<double>(-9.561811498244175e+40, +4.854995782103029e+40);
+
+    h_ref_cby0_large(0) =
+        Kokkos::complex<double>(4.854995782103029e+40, -9.561811498244175e+40);
+    h_ref_cby0_large(1) =
+        Kokkos::complex<double>(-4.854995782103029e+40, -9.561811498244175e+40);
+
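+    // Entries 0 and 3 use the default optional arguments of cyl_bessel_j0 /
+    // cyl_bessel_y0 and should match the reference to between 1e-13 and
+    // 1e-12 (relative); entries 1 and 4, computed with (11000, 3000), are
+    // expected to be off by more than 1e-6, while entries 2 and 5, computed
+    // with (11000, 7500), recover 1e-13 accuracy.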
+    EXPECT_TRUE((Kokkos::abs(h_cbj0_large(0) - h_ref_cbj0_large(0)) <
+                 Kokkos::abs(h_ref_cbj0_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cbj0_large(0) - h_ref_cbj0_large(0)) >
+                 Kokkos::abs(h_ref_cbj0_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(1) - h_ref_cbj0_large(0)) >
+                Kokkos::abs(h_ref_cbj0_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(2) - h_ref_cbj0_large(0)) <
+                Kokkos::abs(h_ref_cbj0_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cbj0_large(3) - h_ref_cbj0_large(1)) <
+                 Kokkos::abs(h_ref_cbj0_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cbj0_large(3) - h_ref_cbj0_large(1)) >
+                 Kokkos::abs(h_ref_cbj0_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(4) - h_ref_cbj0_large(1)) >
+                Kokkos::abs(h_ref_cbj0_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj0_large(5) - h_ref_cbj0_large(1)) <
+                Kokkos::abs(h_ref_cbj0_large(1)) * 1e-13);
+
+    EXPECT_TRUE((Kokkos::abs(h_cby0_large(0) - h_ref_cby0_large(0)) <
+                 Kokkos::abs(h_ref_cby0_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cby0_large(0) - h_ref_cby0_large(0)) >
+                 Kokkos::abs(h_ref_cby0_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(1) - h_ref_cby0_large(0)) >
+                Kokkos::abs(h_ref_cby0_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(2) - h_ref_cby0_large(0)) <
+                Kokkos::abs(h_ref_cby0_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cby0_large(3) - h_ref_cby0_large(1)) <
+                 Kokkos::abs(h_ref_cby0_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cby0_large(3) - h_ref_cby0_large(1)) >
+                 Kokkos::abs(h_ref_cby0_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(4) - h_ref_cby0_large(1)) >
+                Kokkos::abs(h_ref_cby0_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby0_large(5) - h_ref_cby0_large(1)) <
+                Kokkos::abs(h_ref_cby0_large(1)) * 1e-13);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbj0(i) = Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cby0(i) = Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbj0_large(0) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbj0_large(1) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cbj0_large(2) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cbj0_large(3) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbj0_large(4) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cbj0_large(5) =
+        Kokkos::Experimental::cyl_bessel_j0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+
+    d_cby0_large(0) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cby0_large(1) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cby0_large(2) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cby0_large(3) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cby0_large(4) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cby0_large(5) =
+        Kokkos::Experimental::cyl_bessel_y0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselJ1Y1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbj1, d_cby1;
+  typename ViewType::HostMirror h_z, h_cbj1, h_cby1;
+  HostViewType h_ref_cbj1, h_ref_cby1;
+
+  ViewType d_z_large, d_cbj1_large, d_cby1_large;
+  typename ViewType::HostMirror h_z_large, h_cbj1_large, h_cby1_large;
+  HostViewType h_ref_cbj1_large, h_ref_cby1_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbj1     = ViewType("d_cbj1", N);
+    d_cby1     = ViewType("d_cby1", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbj1     = Kokkos::create_mirror_view(d_cbj1);
+    h_cby1     = Kokkos::create_mirror_view(d_cby1);
+    h_ref_cbj1 = HostViewType("h_ref_cbj1", N);
+    h_ref_cby1 = HostViewType("h_ref_cby1", N);
+
+    // Generate test inputs
+    h_z(0) = Kokkos::complex<double>(0.0, 0.0);
+    // abs(z)<=25
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    // abs(z)>25
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj1, d_cbj1);
+    Kokkos::deep_copy(h_cby1, d_cby1);
+
+    // Reference values computed with Octave
+    h_ref_cbj1(0) = Kokkos::complex<double>(0, 0);
+    h_ref_cbj1(1) =
+        Kokkos::complex<double>(7.801488485792540e-01, -1.260982060238848e+00);
+    h_ref_cbj1(2) =
+        Kokkos::complex<double>(7.801488485792540e-01, +1.260982060238848e+00);
+    h_ref_cbj1(3) =
+        Kokkos::complex<double>(-7.801488485792543e-01, -1.260982060238848e+00);
+    h_ref_cbj1(4) =
+        Kokkos::complex<double>(-7.801488485792543e-01, +1.260982060238848e+00);
+    h_ref_cbj1(5) =
+        Kokkos::complex<double>(-7.469476253429664e+02, -1.576608505254311e+03);
+    h_ref_cbj1(6) =
+        Kokkos::complex<double>(-7.469476253429664e+02, +1.576608505254311e+03);
+    h_ref_cbj1(7) =
+        Kokkos::complex<double>(7.469476253429661e+02, -1.576608505254311e+03);
+    h_ref_cbj1(8) =
+        Kokkos::complex<double>(7.469476253429661e+02, +1.576608505254311e+03);
+    h_ref_cbj1(9) = Kokkos::complex<double>(3.390589585259365e-01, 0);
+    h_ref_cbj1(10) =
+        Kokkos::complex<double>(-3.390589585259365e-01, +3.373499138396203e-17);
+    h_ref_cbj1(11) = Kokkos::complex<double>(-3.951932188370151e-02, 0);
+    h_ref_cbj1(12) =
+        Kokkos::complex<double>(3.951932188370151e-02, +7.988560221984213e-18);
+    h_ref_cbj1(13) =
+        Kokkos::complex<double>(1.233147100257312e+03, -1.027302265904111e+03);
+    h_ref_cbj1(14) =
+        Kokkos::complex<double>(1.233147100257312e+03, +1.027302265904111e+03);
+    h_ref_cbj1(15) =
+        Kokkos::complex<double>(-1.233147100257312e+03, -1.027302265904111e+03);
+    h_ref_cbj1(16) =
+        Kokkos::complex<double>(-1.233147100257312e+03, +1.027302265904111e+03);
+    h_ref_cbj1(17) =
+        Kokkos::complex<double>(4.248029136732908e+02, -1.042364939115052e+03);
+    h_ref_cbj1(18) =
+        Kokkos::complex<double>(4.248029136732908e+02, +1.042364939115052e+03);
+    h_ref_cbj1(19) =
+        Kokkos::complex<double>(-4.248029136732909e+02, -1.042364939115052e+03);
+    h_ref_cbj1(20) =
+        Kokkos::complex<double>(-4.248029136732909e+02, +1.042364939115052e+03);
+    h_ref_cbj1(21) = Kokkos::complex<double>(1.305514883350938e-01, 0);
+    h_ref_cbj1(22) =
+        Kokkos::complex<double>(-1.305514883350938e-01, +7.993709105806192e-18);
+    h_ref_cbj1(23) = Kokkos::complex<double>(4.659838375816632e-02, 0);
+    h_ref_cbj1(24) =
+        Kokkos::complex<double>(-4.659838375816632e-02, +6.322680793358811e-18);
+
+    h_ref_cby1(0) = Kokkos::complex<double>(-infinity<double>::value, 0);
+    h_ref_cby1(1) =
+        Kokkos::complex<double>(1.285849341463599e+00, +7.250812532419394e-01);
+    h_ref_cby1(2) =
+        Kokkos::complex<double>(1.285849341463599e+00, -7.250812532419394e-01);
+    h_ref_cby1(3) =
+        Kokkos::complex<double>(1.236114779014097e+00, -8.352164439165690e-01);
+    h_ref_cby1(4) =
+        Kokkos::complex<double>(1.236114779014097e+00, +8.352164439165690e-01);
+    h_ref_cby1(5) =
+        Kokkos::complex<double>(1.576608512528508e+03, -7.469476251109801e+02);
+    h_ref_cby1(6) =
+        Kokkos::complex<double>(1.576608512528508e+03, +7.469476251109801e+02);
+    h_ref_cby1(7) =
+        Kokkos::complex<double>(1.576608497980113e+03, +7.469476255749524e+02);
+    h_ref_cby1(8) =
+        Kokkos::complex<double>(1.576608497980113e+03, -7.469476255749524e+02);
+    h_ref_cby1(9) = Kokkos::complex<double>(3.246744247918000e-01, 0);
+    h_ref_cby1(10) =
+        Kokkos::complex<double>(-3.246744247918000e-01, -6.781179170518730e-01);
+    h_ref_cby1(11) = Kokkos::complex<double>(1.616692009926331e-01, 0);
+    h_ref_cby1(12) =
+        Kokkos::complex<double>(-1.616692009926332e-01, +7.903864376740302e-02);
+    h_ref_cby1(13) =
+        Kokkos::complex<double>(1.027302268200224e+03, +1.233147093992241e+03);
+    h_ref_cby1(14) =
+        Kokkos::complex<double>(1.027302268200224e+03, -1.233147093992241e+03);
+    h_ref_cby1(15) =
+        Kokkos::complex<double>(1.027302263607999e+03, -1.233147106522383e+03);
+    h_ref_cby1(16) =
+        Kokkos::complex<double>(1.027302263607999e+03, +1.233147106522383e+03);
+    h_ref_cby1(17) =
+        Kokkos::complex<double>(1.042364943073579e+03, +4.248029112344685e+02);
+    h_ref_cby1(18) =
+        Kokkos::complex<double>(1.042364943073579e+03, -4.248029112344685e+02);
+    h_ref_cby1(19) =
+        Kokkos::complex<double>(1.042364935156525e+03, -4.248029161121132e+02);
+    h_ref_cby1(20) =
+        Kokkos::complex<double>(1.042364935156525e+03, +4.248029161121132e+02);
+    h_ref_cby1(21) = Kokkos::complex<double>(7.552212658226459e-02, 0);
+    h_ref_cby1(22) =
+        Kokkos::complex<double>(-7.552212658226459e-02, -2.611029766701876e-01);
+    h_ref_cby1(23) = Kokkos::complex<double>(9.186960936986688e-02, 0);
+    h_ref_cby1(24) =
+        Kokkos::complex<double>(-9.186960936986688e-02, -9.319676751633262e-02);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbj1(i) - h_ref_cbj1(i)),
+                Kokkos::abs(h_ref_cbj1(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cby1(0), h_cby1(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cby1(i) - h_ref_cby1(i)),
+                Kokkos::abs(h_ref_cby1(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbj1_large     = ViewType("d_cbj1_large", 6);
+    d_cby1_large     = ViewType("d_cby1_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbj1_large     = Kokkos::create_mirror_view(d_cbj1_large);
+    h_cby1_large     = Kokkos::create_mirror_view(d_cby1_large);
+    h_ref_cbj1_large = HostViewType("h_ref_cbj1_large", 2);
+    h_ref_cby1_large = HostViewType("h_ref_cby1_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(1) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(2) = Kokkos::complex<double>(10000.0, 100.0);
+    h_z_large(3) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(4) = Kokkos::complex<double>(-10000.0, 100.0);
+    h_z_large(5) = Kokkos::complex<double>(-10000.0, 100.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbj1_large, d_cbj1_large);
+    Kokkos::deep_copy(h_cby1_large, d_cby1_large);
+
+    h_ref_cbj1_large(0) =
+        Kokkos::complex<double>(4.854515317906369e+40, -9.562049455402486e+40);
+    h_ref_cbj1_large(1) =
+        Kokkos::complex<double>(-4.854515317906371e+40, -9.562049455402486e+40);
+
+    h_ref_cby1_large(0) =
+        Kokkos::complex<double>(9.562049455402486e+40, 4.854515317906369e+40);
+    h_ref_cby1_large(1) =
+        Kokkos::complex<double>(9.562049455402486e+40, -4.854515317906369e+40);
+
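+    // Same accuracy pattern as for J0/Y0 above: default arguments land
+    // between 1e-13 and 1e-12, (11000, 3000) degrades the result beyond
+    // 1e-6, and (11000, 7500) recovers 1e-13 accuracy.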
+    EXPECT_TRUE((Kokkos::abs(h_cbj1_large(0) - h_ref_cbj1_large(0)) <
+                 Kokkos::abs(h_ref_cbj1_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cbj1_large(0) - h_ref_cbj1_large(0)) >
+                 Kokkos::abs(h_ref_cbj1_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(1) - h_ref_cbj1_large(0)) >
+                Kokkos::abs(h_ref_cbj1_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(2) - h_ref_cbj1_large(0)) <
+                Kokkos::abs(h_ref_cbj1_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cbj1_large(3) - h_ref_cbj1_large(1)) <
+                 Kokkos::abs(h_ref_cbj1_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cbj1_large(3) - h_ref_cbj1_large(1)) >
+                 Kokkos::abs(h_ref_cbj1_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(4) - h_ref_cbj1_large(1)) >
+                Kokkos::abs(h_ref_cbj1_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cbj1_large(5) - h_ref_cbj1_large(1)) <
+                Kokkos::abs(h_ref_cbj1_large(1)) * 1e-13);
+
+    EXPECT_TRUE((Kokkos::abs(h_cby1_large(0) - h_ref_cby1_large(0)) <
+                 Kokkos::abs(h_ref_cby1_large(0)) * 1e-12) &&
+                (Kokkos::abs(h_cby1_large(0) - h_ref_cby1_large(0)) >
+                 Kokkos::abs(h_ref_cby1_large(0)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(1) - h_ref_cby1_large(0)) >
+                Kokkos::abs(h_ref_cby1_large(0)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(2) - h_ref_cby1_large(0)) <
+                Kokkos::abs(h_ref_cby1_large(0)) * 1e-13);
+    EXPECT_TRUE((Kokkos::abs(h_cby1_large(3) - h_ref_cby1_large(1)) <
+                 Kokkos::abs(h_ref_cby1_large(1)) * 1e-12) &&
+                (Kokkos::abs(h_cby1_large(3) - h_ref_cby1_large(1)) >
+                 Kokkos::abs(h_ref_cby1_large(1)) * 1e-13));
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(4) - h_ref_cby1_large(1)) >
+                Kokkos::abs(h_ref_cby1_large(1)) * 1e-6);
+    EXPECT_TRUE(Kokkos::abs(h_cby1_large(5) - h_ref_cby1_large(1)) <
+                Kokkos::abs(h_ref_cby1_large(1)) * 1e-13);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbj1(i) = Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cby1(i) = Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbj1_large(0) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbj1_large(1) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cbj1_large(2) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cbj1_large(3) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbj1_large(4) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cbj1_large(5) =
+        Kokkos::Experimental::cyl_bessel_j1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+
+    d_cby1_large(0) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cby1_large(1) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 11000, 3000);
+    d_cby1_large(2) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 11000, 7500);
+    d_cby1_large(3) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cby1_large(4) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 11000, 3000);
+    d_cby1_large(5) =
+        Kokkos::Experimental::cyl_bessel_y1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 11000, 7500);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselI0K0Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbi0, d_cbk0;
+  typename ViewType::HostMirror h_z, h_cbi0, h_cbk0;
+  HostViewType h_ref_cbi0, h_ref_cbk0;
+
+  ViewType d_z_large, d_cbi0_large, d_cbk0_large;
+  typename ViewType::HostMirror h_z_large, h_cbi0_large, h_cbk0_large;
+  HostViewType h_ref_cbi0_large, h_ref_cbk0_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbi0     = ViewType("d_cbi0", N);
+    d_cbk0     = ViewType("d_cbk0", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbi0     = Kokkos::create_mirror_view(d_cbi0);
+    h_cbk0     = Kokkos::create_mirror_view(d_cbk0);
+    h_ref_cbi0 = HostViewType("h_ref_cbi0", N);
+    h_ref_cbk0 = HostViewType("h_ref_cbk0", N);
+
+    // Generate test inputs
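+    // Inputs cover z, conj(z), -z, and -conj(z), plus purely real points,
+    // for both moderate and large |z|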
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi0, d_cbi0);
+    Kokkos::deep_copy(h_cbk0, d_cbk0);
+
+    // Reference values computed with Octave
+    h_ref_cbi0(0) = Kokkos::complex<double>(1.000000000000000e+00, 0);
+    h_ref_cbi0(1) =
+        Kokkos::complex<double>(-4.695171920440706e-01, +4.313788409468920e+00);
+    h_ref_cbi0(2) =
+        Kokkos::complex<double>(-4.695171920440706e-01, -4.313788409468920e+00);
+    h_ref_cbi0(3) =
+        Kokkos::complex<double>(-4.695171920440706e-01, -4.313788409468920e+00);
+    h_ref_cbi0(4) =
+        Kokkos::complex<double>(-4.695171920440706e-01, +4.313788409468920e+00);
+    h_ref_cbi0(5) =
+        Kokkos::complex<double>(-7.276526052028507e+08, -2.806354803468570e+08);
+    h_ref_cbi0(6) =
+        Kokkos::complex<double>(-7.276526052028507e+08, +2.806354803468570e+08);
+    h_ref_cbi0(7) =
+        Kokkos::complex<double>(-7.276526052028507e+08, +2.806354803468570e+08);
+    h_ref_cbi0(8) =
+        Kokkos::complex<double>(-7.276526052028507e+08, -2.806354803468570e+08);
+    h_ref_cbi0(9)  = Kokkos::complex<double>(4.880792585865025e+00, 0);
+    h_ref_cbi0(10) = Kokkos::complex<double>(4.880792585865025e+00, 0);
+    h_ref_cbi0(11) = Kokkos::complex<double>(8.151421225128924e+08, 0);
+    h_ref_cbi0(12) = Kokkos::complex<double>(8.151421225128924e+08, 0);
+    h_ref_cbi0(13) =
+        Kokkos::complex<double>(-9.775983282455373e+10, -4.159160389327644e+10);
+    h_ref_cbi0(14) =
+        Kokkos::complex<double>(-9.775983282455373e+10, +4.159160389327644e+10);
+    h_ref_cbi0(15) =
+        Kokkos::complex<double>(-9.775983282455373e+10, +4.159160389327644e+10);
+    h_ref_cbi0(16) =
+        Kokkos::complex<double>(-9.775983282455373e+10, -4.159160389327644e+10);
+    h_ref_cbi0(17) =
+        Kokkos::complex<double>(-5.158377566681892e+24, -2.766704059464302e+24);
+    h_ref_cbi0(18) =
+        Kokkos::complex<double>(-5.158377566681892e+24, +2.766704059464302e+24);
+    h_ref_cbi0(19) =
+        Kokkos::complex<double>(-5.158377566681892e+24, +2.766704059464302e+24);
+    h_ref_cbi0(20) =
+        Kokkos::complex<double>(-5.158377566681892e+24, -2.766704059464302e+24);
+    h_ref_cbi0(21) = Kokkos::complex<double>(1.095346047317573e+11, 0);
+    h_ref_cbi0(22) = Kokkos::complex<double>(1.095346047317573e+11, 0);
+    h_ref_cbi0(23) = Kokkos::complex<double>(5.894077055609803e+24, 0);
+    h_ref_cbi0(24) = Kokkos::complex<double>(5.894077055609803e+24, 0);
+
+    h_ref_cbk0(0) = Kokkos::complex<double>(infinity<double>::value, 0);
+    h_ref_cbk0(1) =
+        Kokkos::complex<double>(-2.078722558742977e-02, -2.431266356716766e-02);
+    h_ref_cbk0(2) =
+        Kokkos::complex<double>(-2.078722558742977e-02, +2.431266356716766e-02);
+    h_ref_cbk0(3) =
+        Kokkos::complex<double>(-1.357295320191579e+01, +1.499344424826928e+00);
+    h_ref_cbk0(4) =
+        Kokkos::complex<double>(-1.357295320191579e+01, -1.499344424826928e+00);
+    h_ref_cbk0(5) =
+        Kokkos::complex<double>(-1.820476218131465e-11, +1.795056004780177e-11);
+    h_ref_cbk0(6) =
+        Kokkos::complex<double>(-1.820476218131465e-11, -1.795056004780177e-11);
+    h_ref_cbk0(7) =
+        Kokkos::complex<double>(8.816423633943287e+08, +2.285988078870750e+09);
+    h_ref_cbk0(8) =
+        Kokkos::complex<double>(8.816423633943287e+08, -2.285988078870750e+09);
+    h_ref_cbk0(9) = Kokkos::complex<double>(3.473950438627926e-02, 0);
+    h_ref_cbk0(10) =
+        Kokkos::complex<double>(3.473950438627926e-02, -1.533346213144909e+01);
+    h_ref_cbk0(11) = Kokkos::complex<double>(2.667545110351910e-11, 0);
+    h_ref_cbk0(12) =
+        Kokkos::complex<double>(2.667545110351910e-11, -2.560844503718094e+09);
+    h_ref_cbk0(13) =
+        Kokkos::complex<double>(-1.163319528590747e-13, +1.073711234918388e-13);
+    h_ref_cbk0(14) =
+        Kokkos::complex<double>(-1.163319528590747e-13, -1.073711234918388e-13);
+    h_ref_cbk0(15) =
+        Kokkos::complex<double>(1.306638772421339e+11, +3.071215726177843e+11);
+    h_ref_cbk0(16) =
+        Kokkos::complex<double>(1.306638772421339e+11, -3.071215726177843e+11);
+    h_ref_cbk0(17) =
+        Kokkos::complex<double>(-1.111584549467388e-27, +8.581979311477652e-28);
+    h_ref_cbk0(18) =
+        Kokkos::complex<double>(-1.111584549467388e-27, -8.581979311477652e-28);
+    h_ref_cbk0(19) =
+        Kokkos::complex<double>(8.691857147870108e+24, +1.620552106793022e+25);
+    h_ref_cbk0(20) =
+        Kokkos::complex<double>(8.691857147870108e+24, -1.620552106793022e+25);
+    h_ref_cbk0(21) = Kokkos::complex<double>(1.630534586888181e-13, 0);
+    h_ref_cbk0(22) =
+        Kokkos::complex<double>(1.630534586888181e-13, -3.441131095391506e+11);
+    h_ref_cbk0(23) = Kokkos::complex<double>(1.413897840559108e-27, 0);
+    h_ref_cbk0(24) =
+        Kokkos::complex<double>(1.413897840559108e-27, -1.851678917759592e+25);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbi0(i) - h_ref_cbi0(i)),
+                Kokkos::abs(h_ref_cbi0(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)),
+                Kokkos::abs(h_ref_cbk0(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbi0_large     = ViewType("d_cbi0_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbi0_large     = Kokkos::create_mirror_view(d_cbi0_large);
+    h_ref_cbi0_large = HostViewType("h_ref_cbi0_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(1) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(2) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(3) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(4) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(5) = Kokkos::complex<double>(-100.0, 10.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi0_large, d_cbi0_large);
+
+    h_ref_cbi0_large(0) =
+        Kokkos::complex<double>(-9.266819049505678e+41, -5.370779383266049e+41);
+    h_ref_cbi0_large(1) =
+        Kokkos::complex<double>(-9.266819049505678e+41, +5.370779383266049e+41);
+
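+    // Entries 0 and 3 (default arguments) and entries 2 and 5 (110, 190)
+    // should match the reference to 1e-15; entries 1 and 4, computed with
+    // (110, 35), are expected to deviate by more than 1e-4.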
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(0) - h_ref_cbi0_large(0)) <
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(1) - h_ref_cbi0_large(0)) >
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(2) - h_ref_cbi0_large(0)) <
+                Kokkos::abs(h_ref_cbi0_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(3) - h_ref_cbi0_large(1)) <
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(4) - h_ref_cbi0_large(1)) >
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi0_large(5) - h_ref_cbi0_large(1)) <
+                Kokkos::abs(h_ref_cbi0_large(1)) * 1e-15);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbi0(i) = Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cbk0(i) = Kokkos::Experimental::cyl_bessel_k0<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbi0_large(0) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbi0_large(1) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 110, 35);
+    d_cbi0_large(2) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 110, 190);
+    d_cbi0_large(3) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbi0_large(4) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 110, 35);
+    d_cbi0_large(5) =
+        Kokkos::Experimental::cyl_bessel_i0<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 110, 190);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselI1K1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_cbi1, d_cbk1;
+  typename ViewType::HostMirror h_z, h_cbi1, h_cbk1;
+  HostViewType h_ref_cbi1, h_ref_cbk1;
+
+  ViewType d_z_large, d_cbi1_large, d_cbk1_large;
+  typename ViewType::HostMirror h_z_large, h_cbi1_large, h_cbk1_large;
+  HostViewType h_ref_cbi1_large, h_ref_cbk1_large;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_cbi1     = ViewType("d_cbi1", N);
+    d_cbk1     = ViewType("d_cbk1", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_cbi1     = Kokkos::create_mirror_view(d_cbi1);
+    h_cbk1     = Kokkos::create_mirror_view(d_cbk1);
+    h_ref_cbi1 = HostViewType("h_ref_cbi1", N);
+    h_ref_cbk1 = HostViewType("h_ref_cbk1", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(60.0, 10.0);
+    h_z(18) = Kokkos::complex<double>(60.0, -10.0);
+    h_z(19) = Kokkos::complex<double>(-60.0, 10.0);
+    h_z(20) = Kokkos::complex<double>(-60.0, -10.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(60.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-60.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Bessel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi1, d_cbi1);
+    Kokkos::deep_copy(h_cbk1, d_cbk1);
+
+    // Reference values computed with Octave
+    h_ref_cbi1(0) = Kokkos::complex<double>(0, 0);
+    h_ref_cbi1(1) =
+        Kokkos::complex<double>(-8.127809410735776e-01, +3.780682961371298e+00);
+    h_ref_cbi1(2) =
+        Kokkos::complex<double>(-8.127809410735776e-01, -3.780682961371298e+00);
+    h_ref_cbi1(3) =
+        Kokkos::complex<double>(8.127809410735776e-01, +3.780682961371298e+00);
+    h_ref_cbi1(4) =
+        Kokkos::complex<double>(8.127809410735776e-01, -3.780682961371298e+00);
+    h_ref_cbi1(5) =
+        Kokkos::complex<double>(-7.119745937677552e+08, -2.813616375214342e+08);
+    h_ref_cbi1(6) =
+        Kokkos::complex<double>(-7.119745937677552e+08, +2.813616375214342e+08);
+    h_ref_cbi1(7) =
+        Kokkos::complex<double>(7.119745937677552e+08, -2.813616375214342e+08);
+    h_ref_cbi1(8) =
+        Kokkos::complex<double>(7.119745937677552e+08, +2.813616375214342e+08);
+    h_ref_cbi1(9)  = Kokkos::complex<double>(3.953370217402609e+00, 0);
+    h_ref_cbi1(10) = Kokkos::complex<double>(-3.953370217402609e+00, 0);
+    h_ref_cbi1(11) = Kokkos::complex<double>(7.972200260896506e+08, 0);
+    h_ref_cbi1(12) = Kokkos::complex<double>(-7.972200260896506e+08, 0);
+    h_ref_cbi1(13) =
+        Kokkos::complex<double>(-9.596150723281404e+10, -4.149038020045121e+10);
+    h_ref_cbi1(14) =
+        Kokkos::complex<double>(-9.596150723281404e+10, +4.149038020045121e+10);
+    h_ref_cbi1(15) =
+        Kokkos::complex<double>(9.596150723281404e+10, -4.149038020045121e+10);
+    h_ref_cbi1(16) =
+        Kokkos::complex<double>(9.596150723281404e+10, +4.149038020045121e+10);
+    h_ref_cbi1(17) =
+        Kokkos::complex<double>(-5.112615594220387e+24, -2.751210232069100e+24);
+    h_ref_cbi1(18) =
+        Kokkos::complex<double>(-5.112615594220387e+24, +2.751210232069100e+24);
+    h_ref_cbi1(19) =
+        Kokkos::complex<double>(5.112615594220387e+24, -2.751210232069100e+24);
+    h_ref_cbi1(20) =
+        Kokkos::complex<double>(5.112615594220387e+24, +2.751210232069100e+24);
+    h_ref_cbi1(21) = Kokkos::complex<double>(1.075605042080823e+11, 0);
+    h_ref_cbi1(22) = Kokkos::complex<double>(-1.075605042080823e+11, 0);
+    h_ref_cbi1(23) = Kokkos::complex<double>(5.844751588390470e+24, 0);
+    h_ref_cbi1(24) = Kokkos::complex<double>(-5.844751588390470e+24, 0);
+
+    h_ref_cbk1(0) = Kokkos::complex<double>(infinity<double>::value, 0);
+    h_ref_cbk1(1) =
+        Kokkos::complex<double>(-2.480952007015153e-02, -2.557074905635180e-02);
+    h_ref_cbk1(2) =
+        Kokkos::complex<double>(-2.480952007015153e-02, +2.557074905635180e-02);
+    h_ref_cbk1(3) =
+        Kokkos::complex<double>(-1.185255629692602e+01, +2.527855884398198e+00);
+    h_ref_cbk1(4) =
+        Kokkos::complex<double>(-1.185255629692602e+01, -2.527855884398198e+00);
+    h_ref_cbk1(5) =
+        Kokkos::complex<double>(-1.839497240093994e-11, +1.841855854336314e-11);
+    h_ref_cbk1(6) =
+        Kokkos::complex<double>(-1.839497240093994e-11, -1.841855854336314e-11);
+    h_ref_cbk1(7) =
+        Kokkos::complex<double>(8.839236534393319e+08, +2.236734153323357e+09);
+    h_ref_cbk1(8) =
+        Kokkos::complex<double>(8.839236534393319e+08, -2.236734153323357e+09);
+    h_ref_cbk1(9) = Kokkos::complex<double>(4.015643112819419e-02, 0);
+    h_ref_cbk1(10) =
+        Kokkos::complex<double>(-4.015643112819419e-02, -1.241987883191272e+01);
+    h_ref_cbk1(11) = Kokkos::complex<double>(2.724930589574976e-11, 0);
+    h_ref_cbk1(12) =
+        Kokkos::complex<double>(-2.724930589574976e-11, -2.504540577257910e+09);
+    h_ref_cbk1(13) =
+        Kokkos::complex<double>(-1.175637676331817e-13, +1.097080943197297e-13);
+    h_ref_cbk1(14) =
+        Kokkos::complex<double>(-1.175637676331817e-13, -1.097080943197297e-13);
+    h_ref_cbk1(15) =
+        Kokkos::complex<double>(1.303458736323849e+11, +3.014719661500124e+11);
+    h_ref_cbk1(16) =
+        Kokkos::complex<double>(1.303458736323849e+11, -3.014719661500124e+11);
+    h_ref_cbk1(17) =
+        Kokkos::complex<double>(-1.119411861396158e-27, +8.666195226392352e-28);
+    h_ref_cbk1(18) =
+        Kokkos::complex<double>(-1.119411861396158e-27, -8.666195226392352e-28);
+    h_ref_cbk1(19) =
+        Kokkos::complex<double>(8.643181853549355e+24, +1.606175559143138e+25);
+    h_ref_cbk1(20) =
+        Kokkos::complex<double>(8.643181853549355e+24, -1.606175559143138e+25);
+    h_ref_cbk1(21) = Kokkos::complex<double>(1.659400107332009e-13, 0);
+    h_ref_cbk1(22) =
+        Kokkos::complex<double>(-1.659400107332009e-13, -3.379112898365253e+11);
+    h_ref_cbk1(23) = Kokkos::complex<double>(1.425632026517104e-27, 0);
+    h_ref_cbk1(24) =
+        Kokkos::complex<double>(-1.425632026517104e-27, -1.836182865214478e+25);
+
+    for (int i = 0; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbi1(i) - h_ref_cbi1(i)),
+                Kokkos::abs(h_ref_cbi1(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)),
+                Kokkos::abs(h_ref_cbk1(i)) * 1e-13);
+    }
+
+    // Test large arguments
+    d_z_large        = ViewType("d_z_large", 6);
+    d_cbi1_large     = ViewType("d_cbi1_large", 6);
+    h_z_large        = Kokkos::create_mirror_view(d_z_large);
+    h_cbi1_large     = Kokkos::create_mirror_view(d_cbi1_large);
+    h_ref_cbi1_large = HostViewType("h_ref_cbi1_large", 2);
+
+    h_z_large(0) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(1) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(2) = Kokkos::complex<double>(100.0, 10.0);
+    h_z_large(3) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(4) = Kokkos::complex<double>(-100.0, 10.0);
+    h_z_large(5) = Kokkos::complex<double>(-100.0, 10.0);
+
+    Kokkos::deep_copy(d_z_large, h_z_large);
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace, TestLargeArgTag>(0, 1),
+                         *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_cbi1_large, d_cbi1_large);
+
+    h_ref_cbi1_large(0) =
+        Kokkos::complex<double>(-9.218158020154234e+41, -5.348736158968607e+41);
+    h_ref_cbi1_large(1) =
+        Kokkos::complex<double>(9.218158020154234e+41, -5.348736158968607e+41);
+
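+    // Same pattern as for I0 above: defaults and (110, 190) match to 1e-15,
+    // while (110, 35) deviates by more than 1e-4.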
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(0) - h_ref_cbi1_large(0)) <
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(1) - h_ref_cbi1_large(0)) >
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(2) - h_ref_cbi1_large(0)) <
+                Kokkos::abs(h_ref_cbi1_large(0)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(3) - h_ref_cbi1_large(1)) <
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-15);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(4) - h_ref_cbi1_large(1)) >
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-4);
+    EXPECT_TRUE(Kokkos::abs(h_cbi1_large(5) - h_ref_cbi1_large(1)) <
+                Kokkos::abs(h_ref_cbi1_large(1)) * 1e-15);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_cbi1(i) = Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+    d_cbk1(i) = Kokkos::Experimental::cyl_bessel_k1<Kokkos::complex<double>,
+                                                    double, int>(d_z(i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TestLargeArgTag&, const int& /*i*/) const {
+    d_cbi1_large(0) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(0));
+    d_cbi1_large(1) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(1), 110, 35);
+    d_cbi1_large(2) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(2), 110, 190);
+    d_cbi1_large(3) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(3));
+    d_cbi1_large(4) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(4), 110, 35);
+    d_cbi1_large(5) =
+        Kokkos::Experimental::cyl_bessel_i1<Kokkos::complex<double>, double,
+                                            int>(d_z_large(5), 110, 190);
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselH1Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_ch10, d_ch11;
+  typename ViewType::HostMirror h_z, h_ch10, h_ch11;
+  HostViewType h_ref_ch10, h_ref_ch11;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_ch10     = ViewType("d_ch10", N);
+    d_ch11     = ViewType("d_ch11", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_ch10     = Kokkos::create_mirror_view(d_ch10);
+    h_ch11     = Kokkos::create_mirror_view(d_ch11);
+    h_ref_ch10 = HostViewType("h_ref_ch10", N);
+    h_ref_ch11 = HostViewType("h_ref_ch11", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(200.0, 60.0);
+    h_z(18) = Kokkos::complex<double>(200.0, -60.0);
+    h_z(19) = Kokkos::complex<double>(-200.0, 60.0);
+    h_z(20) = Kokkos::complex<double>(-200.0, -60.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(200.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-200.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Hankel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_ch10, d_ch10);
+    Kokkos::deep_copy(h_ch11, d_ch11);
+
+    // Reference values computed with Octave
+    h_ref_ch10(0) = Kokkos::complex<double>(1.0, -infinity<double>::value);
+    h_ref_ch10(1) =
+        Kokkos::complex<double>(-1.779327030399459e-02, +5.281940449715537e-02);
+    h_ref_ch10(2) =
+        Kokkos::complex<double>(-2.480676488910849e+00, +1.948786988612626e+00);
+    h_ref_ch10(3) =
+        Kokkos::complex<double>(1.779327030399459e-02, +5.281940449715537e-02);
+    h_ref_ch10(4) =
+        Kokkos::complex<double>(-2.516263029518839e+00, -1.843148179618315e+00);
+    h_ref_ch10(5) =
+        Kokkos::complex<double>(-7.217716938222564e-06, -1.002796203581228e-07);
+    h_ref_ch10(6) =
+        Kokkos::complex<double>(-3.204879955218674e+03, -1.446133490498241e+03);
+    h_ref_ch10(7) =
+        Kokkos::complex<double>(7.217716938222564e-06, -1.002796203581228e-07);
+    h_ref_ch10(8) =
+        Kokkos::complex<double>(-3.204879969654108e+03, +1.446133490297682e+03);
+    h_ref_ch10(9) =
+        Kokkos::complex<double>(-2.600519549019334e-01, +3.768500100127903e-01);
+    h_ref_ch10(10) =
+        Kokkos::complex<double>(2.600519549019334e-01, +3.768500100127903e-01);
+    h_ref_ch10(11) =
+        Kokkos::complex<double>(-1.624127813134865e-01, -3.598179027370283e-02);
+    h_ref_ch10(12) =
+        Kokkos::complex<double>(1.624127813134865e-01, -3.598179027370283e-02);
+    h_ref_ch10(13) =
+        Kokkos::complex<double>(-2.184905481759440e-06, +6.263387166445335e-06);
+    h_ref_ch10(14) =
+        Kokkos::complex<double>(-2.025824374843011e+03, +2.512479278555672e+03);
+    h_ref_ch10(15) =
+        Kokkos::complex<double>(2.184905481759440e-06, +6.263387166445335e-06);
+    h_ref_ch10(16) =
+        Kokkos::complex<double>(-2.025824379212821e+03, -2.512479266028897e+03);
+    h_ref_ch10(17) =
+        Kokkos::complex<double>(-1.983689762743337e-28, -4.408449940359881e-28);
+    h_ref_ch10(18) =
+        Kokkos::complex<double>(-8.261945332108929e+23, -6.252486138159269e+24);
+    h_ref_ch10(19) =
+        Kokkos::complex<double>(1.983689762743337e-28, -4.408449940359881e-28);
+    h_ref_ch10(20) =
+        Kokkos::complex<double>(-8.261945332108929e+23, +6.252486138159269e+24);
+    h_ref_ch10(21) =
+        Kokkos::complex<double>(-7.315701054899959e-02, +1.318364704235323e-01);
+    h_ref_ch10(22) =
+        Kokkos::complex<double>(7.315701054899959e-02, +1.318364704235323e-01);
+    h_ref_ch10(23) =
+        Kokkos::complex<double>(-1.543743993056510e-02, -5.426577524981793e-02);
+    h_ref_ch10(24) =
+        Kokkos::complex<double>(1.543743993056510e-02, -5.426577524981793e-02);
+
+    h_ref_ch11(0) = Kokkos::complex<double>(0.0, -infinity<double>::value);
+    h_ref_ch11(1) =
+        Kokkos::complex<double>(5.506759533731469e-02, +2.486728122475093e-02);
+    h_ref_ch11(2) =
+        Kokkos::complex<double>(1.505230101821194e+00, +2.546831401702448e+00);
+    h_ref_ch11(3) =
+        Kokkos::complex<double>(5.506759533731469e-02, -2.486728122475093e-02);
+    h_ref_ch11(4) =
+        Kokkos::complex<double>(-1.615365292495823e+00, +2.497096839252946e+00);
+    h_ref_ch11(5) =
+        Kokkos::complex<double>(-2.319863729607219e-07, +7.274197719836158e-06);
+    h_ref_ch11(6) =
+        Kokkos::complex<double>(-1.493895250453947e+03, +3.153217017782819e+03);
+    h_ref_ch11(7) =
+        Kokkos::complex<double>(-2.319863729607210e-07, -7.274197719836158e-06);
+    h_ref_ch11(8) =
+        Kokkos::complex<double>(1.493895250917918e+03, +3.153217003234423e+03);
+    h_ref_ch11(9) =
+        Kokkos::complex<double>(3.390589585259364e-01, +3.246744247918000e-01);
+    h_ref_ch11(10) =
+        Kokkos::complex<double>(3.390589585259364e-01, -3.246744247918000e-01);
+    h_ref_ch11(11) =
+        Kokkos::complex<double>(-3.951932188370152e-02, +1.616692009926331e-01);
+    h_ref_ch11(12) =
+        Kokkos::complex<double>(-3.951932188370151e-02, -1.616692009926331e-01);
+    h_ref_ch11(13) =
+        Kokkos::complex<double>(6.265071091331731e-06, +2.296112637347948e-06);
+    h_ref_ch11(14) =
+        Kokkos::complex<double>(2.466294194249553e+03, +2.054604534104335e+03);
+    h_ref_ch11(15) =
+        Kokkos::complex<double>(6.265071091331731e-06, -2.296112637347947e-06);
+    h_ref_ch11(16) =
+        Kokkos::complex<double>(-2.466294206779695e+03, +2.054604529512110e+03);
+    h_ref_ch11(17) =
+        Kokkos::complex<double>(-4.416040381930448e-28, +1.974955285825768e-28);
+    h_ref_ch11(18) =
+        Kokkos::complex<double>(-6.250095237987940e+24, +8.112776606830997e+23);
+    h_ref_ch11(19) =
+        Kokkos::complex<double>(-4.416040381930448e-28, -1.974955285825769e-28);
+    h_ref_ch11(20) =
+        Kokkos::complex<double>(6.250095237987940e+24, +8.112776606831005e+23);
+    h_ref_ch11(21) =
+        Kokkos::complex<double>(1.305514883350938e-01, +7.552212658226459e-02);
+    h_ref_ch11(22) =
+        Kokkos::complex<double>(1.305514883350938e-01, -7.552212658226456e-02);
+    h_ref_ch11(23) =
+        Kokkos::complex<double>(-5.430453818237824e-02, +1.530182458038999e-02);
+    h_ref_ch11(24) =
+        Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458039000e-02);
+
+    EXPECT_EQ(h_ref_ch10(0), h_ch10(0));
+    std::cout << "h_ch10(0): " << h_ch10(0)
+              << ", h_ref_ch10(0): " << h_ref_ch10(0) << std::endl;
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)),
+                Kokkos::abs(h_ref_ch10(i)) * 1e-13);
+      std::cout << i
+                << ", actual diff: " << Kokkos::abs(h_ch10(i) - h_ref_ch10(i))
+                << ", expected diff: " << Kokkos::abs(h_ref_ch10(i)) * 1e-13
+                << std::endl;
+    }
+
+    EXPECT_EQ(h_ref_ch11(0), h_ch11(0));
+    std::cout << "h_ch11(0): " << h_ch11(0)
+              << ", h_ref_ch11(0): " << h_ref_ch11(0) << std::endl;
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)),
+                Kokkos::abs(h_ref_ch11(i)) * 1e-13);
+      std::cout << i
+                << ", actual diff: " << Kokkos::abs(h_ch11(i) - h_ref_ch11(i))
+                << ", expected diff: " << Kokkos::abs(h_ref_ch11(i)) * 1e-13
+                << std::endl;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_ch10(i) = Kokkos::Experimental::cyl_bessel_h10(d_z(i));
+    d_ch11(i) = Kokkos::Experimental::cyl_bessel_h11(d_z(i));
+  }
+};
+
+template <class ExecSpace>
+struct TestComplexBesselH2Function {
+  using ViewType = Kokkos::View<Kokkos::complex<double>*, ExecSpace>;
+  using HostViewType =
+      Kokkos::View<Kokkos::complex<double>*, Kokkos::HostSpace>;
+
+  ViewType d_z, d_ch20, d_ch21;
+  typename ViewType::HostMirror h_z, h_ch20, h_ch21;
+  HostViewType h_ref_ch20, h_ref_ch21;
+
+  void testit() {
+    using Kokkos::Experimental::infinity;
+
+    int N      = 25;
+    d_z        = ViewType("d_z", N);
+    d_ch20     = ViewType("d_ch20", N);
+    d_ch21     = ViewType("d_ch21", N);
+    h_z        = Kokkos::create_mirror_view(d_z);
+    h_ch20     = Kokkos::create_mirror_view(d_ch20);
+    h_ch21     = Kokkos::create_mirror_view(d_ch21);
+    h_ref_ch20 = HostViewType("h_ref_ch20", N);
+    h_ref_ch21 = HostViewType("h_ref_ch21", N);
+
+    // Generate test inputs
+    h_z(0)  = Kokkos::complex<double>(0.0, 0.0);
+    h_z(1)  = Kokkos::complex<double>(3.0, 2.0);
+    h_z(2)  = Kokkos::complex<double>(3.0, -2.0);
+    h_z(3)  = Kokkos::complex<double>(-3.0, 2.0);
+    h_z(4)  = Kokkos::complex<double>(-3.0, -2.0);
+    h_z(5)  = Kokkos::complex<double>(23.0, 10.0);
+    h_z(6)  = Kokkos::complex<double>(23.0, -10.0);
+    h_z(7)  = Kokkos::complex<double>(-23.0, 10.0);
+    h_z(8)  = Kokkos::complex<double>(-23.0, -10.0);
+    h_z(9)  = Kokkos::complex<double>(3.0, 0.0);
+    h_z(10) = Kokkos::complex<double>(-3.0, 0.0);
+    h_z(11) = Kokkos::complex<double>(23.0, 0.0);
+    h_z(12) = Kokkos::complex<double>(-23.0, 0.0);
+    h_z(13) = Kokkos::complex<double>(28.0, 10.0);
+    h_z(14) = Kokkos::complex<double>(28.0, -10.0);
+    h_z(15) = Kokkos::complex<double>(-28.0, 10.0);
+    h_z(16) = Kokkos::complex<double>(-28.0, -10.0);
+    h_z(17) = Kokkos::complex<double>(200.0, 60.0);
+    h_z(18) = Kokkos::complex<double>(200.0, -60.0);
+    h_z(19) = Kokkos::complex<double>(-200.0, 60.0);
+    h_z(20) = Kokkos::complex<double>(-200.0, -60.0);
+    h_z(21) = Kokkos::complex<double>(28.0, 0.0);
+    h_z(22) = Kokkos::complex<double>(-28.0, 0.0);
+    h_z(23) = Kokkos::complex<double>(200.0, 0.0);
+    h_z(24) = Kokkos::complex<double>(-200.0, 0.0);
+
+    Kokkos::deep_copy(d_z, h_z);
+
+    // Call Hankel functions
+    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, N), *this);
+    Kokkos::fence();
+
+    Kokkos::deep_copy(h_ch20, d_ch20);
+    Kokkos::deep_copy(h_ch21, d_ch21);
+
+    // Reference values computed with Octave
+    h_ref_ch20(0) = Kokkos::complex<double>(1.0, infinity<double>::value);
+    h_ref_ch20(1) =
+        Kokkos::complex<double>(-2.480676488910849e+00, -1.948786988612626e+00);
+    h_ref_ch20(2) =
+        Kokkos::complex<double>(-1.779327030399459e-02, -5.281940449715537e-02);
+    h_ref_ch20(3) =
+        Kokkos::complex<double>(-2.516263029518839e+00, +1.843148179618315e+00);
+    h_ref_ch20(4) =
+        Kokkos::complex<double>(1.779327030399459e-02, -5.281940449715537e-02);
+    h_ref_ch20(5) =
+        Kokkos::complex<double>(-3.204879955218674e+03, +1.446133490498241e+03);
+    h_ref_ch20(6) =
+        Kokkos::complex<double>(-7.217716938222564e-06, +1.002796203581228e-07);
+    h_ref_ch20(7) =
+        Kokkos::complex<double>(-3.204879969654108e+03, -1.446133490297682e+03);
+    h_ref_ch20(8) =
+        Kokkos::complex<double>(7.217716938222564e-06, +1.002796203581228e-07);
+    h_ref_ch20(9) =
+        Kokkos::complex<double>(-2.600519549019334e-01, -3.768500100127903e-01);
+    h_ref_ch20(10) =
+        Kokkos::complex<double>(-7.801558647058006e-01, -3.768500100127903e-01);
+    h_ref_ch20(11) =
+        Kokkos::complex<double>(-1.624127813134865e-01, +3.598179027370283e-02);
+    h_ref_ch20(12) =
+        Kokkos::complex<double>(-4.872383439404597e-01, +3.598179027370281e-02);
+    h_ref_ch20(13) =
+        Kokkos::complex<double>(-2.025824374843011e+03, -2.512479278555672e+03);
+    h_ref_ch20(14) =
+        Kokkos::complex<double>(-2.184905481759440e-06, -6.263387166445335e-06);
+    h_ref_ch20(15) =
+        Kokkos::complex<double>(-2.025824379212821e+03, +2.512479266028897e+03);
+    h_ref_ch20(16) =
+        Kokkos::complex<double>(2.184905481759440e-06, -6.263387166445335e-06);
+    h_ref_ch20(17) =
+        Kokkos::complex<double>(-8.261945332108929e+23, +6.252486138159269e+24);
+    h_ref_ch20(18) =
+        Kokkos::complex<double>(-1.983689762743337e-28, +4.408449940359881e-28);
+    h_ref_ch20(19) =
+        Kokkos::complex<double>(-8.261945332108929e+23, -6.252486138159269e+24);
+    h_ref_ch20(20) =
+        Kokkos::complex<double>(1.983689762743337e-28, +4.408449940359881e-28);
+    h_ref_ch20(21) =
+        Kokkos::complex<double>(-7.315701054899959e-02, -1.318364704235323e-01);
+    h_ref_ch20(22) =
+        Kokkos::complex<double>(-2.194710316469988e-01, -1.318364704235323e-01);
+    h_ref_ch20(23) =
+        Kokkos::complex<double>(-1.543743993056510e-02, +5.426577524981793e-02);
+    h_ref_ch20(24) =
+        Kokkos::complex<double>(-4.631231979169528e-02, +5.426577524981793e-02);
+
+    h_ref_ch21(0) = Kokkos::complex<double>(0.0, infinity<double>::value);
+    h_ref_ch21(1) =
+        Kokkos::complex<double>(1.505230101821194e+00, -2.546831401702448e+00);
+    h_ref_ch21(2) =
+        Kokkos::complex<double>(5.506759533731469e-02, -2.486728122475093e-02);
+    h_ref_ch21(3) =
+        Kokkos::complex<double>(-1.615365292495823e+00, -2.497096839252946e+00);
+    h_ref_ch21(4) =
+        Kokkos::complex<double>(5.506759533731469e-02, +2.486728122475093e-02);
+    h_ref_ch21(5) =
+        Kokkos::complex<double>(-1.493895250453947e+03, -3.153217017782819e+03);
+    h_ref_ch21(6) =
+        Kokkos::complex<double>(-2.319863729607219e-07, -7.274197719836158e-06);
+    h_ref_ch21(7) =
+        Kokkos::complex<double>(1.493895250917918e+03, -3.153217003234423e+03);
+    h_ref_ch21(8) =
+        Kokkos::complex<double>(-2.319863729607210e-07, +7.274197719836158e-06);
+    h_ref_ch21(9) =
+        Kokkos::complex<double>(3.390589585259364e-01, -3.246744247918000e-01);
+    h_ref_ch21(10) =
+        Kokkos::complex<double>(-1.017176875577809e+00, +3.246744247918000e-01);
+    h_ref_ch21(11) =
+        Kokkos::complex<double>(-3.951932188370152e-02, -1.616692009926331e-01);
+    h_ref_ch21(12) =
+        Kokkos::complex<double>(1.185579656511045e-01, +1.616692009926332e-01);
+    h_ref_ch21(13) =
+        Kokkos::complex<double>(2.466294194249553e+03, -2.054604534104335e+03);
+    h_ref_ch21(14) =
+        Kokkos::complex<double>(6.265071091331731e-06, -2.296112637347948e-06);
+    h_ref_ch21(15) =
+        Kokkos::complex<double>(-2.466294206779695e+03, -2.054604529512110e+03);
+    h_ref_ch21(16) =
+        Kokkos::complex<double>(6.265071091331731e-06, +2.296112637347947e-06);
+    h_ref_ch21(17) =
+        Kokkos::complex<double>(-6.250095237987940e+24, -8.112776606830997e+23);
+    h_ref_ch21(18) =
+        Kokkos::complex<double>(-4.416040381930448e-28, -1.974955285825768e-28);
+    h_ref_ch21(19) =
+        Kokkos::complex<double>(6.250095237987940e+24, -8.112776606831005e+23);
+    h_ref_ch21(20) =
+        Kokkos::complex<double>(-4.416040381930448e-28, +1.974955285825769e-28);
+    h_ref_ch21(21) =
+        Kokkos::complex<double>(1.305514883350938e-01, -7.552212658226459e-02);
+    h_ref_ch21(22) =
+        Kokkos::complex<double>(-3.916544650052814e-01, +7.552212658226461e-02);
+    h_ref_ch21(23) =
+        Kokkos::complex<double>(-5.430453818237824e-02, -1.530182458038999e-02);
+    h_ref_ch21(24) =
+        Kokkos::complex<double>(1.629136145471347e-01, +1.530182458039000e-02);
+
+    EXPECT_EQ(h_ref_ch20(0), h_ch20(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)),
+                Kokkos::abs(h_ref_ch20(i)) * 1e-13);
+    }
+
+    EXPECT_EQ(h_ref_ch21(0), h_ch21(0));
+    for (int i = 1; i < N; i++) {
+      EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)),
+                Kokkos::abs(h_ref_ch21(i)) * 1e-13);
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int& i) const {
+    d_ch20(i) = Kokkos::Experimental::cyl_bessel_h20(d_z(i));
+    d_ch21(i) = Kokkos::Experimental::cyl_bessel_h21(d_z(i));
+  }
+};
+
+TEST(TEST_CATEGORY, mathspecialfunc_expint1) {
+  TestExponentialIntergral1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) {
+  TestComplexErrorFunction<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) {
+  TestComplexBesselJ0Y0Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) {
+  TestComplexBesselJ1Y1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) {
+  TestComplexBesselI0K0Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) {
+  TestComplexBesselI1K1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) {
+  TestComplexBesselH1Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) {
+  TestComplexBesselH2Function<TEST_EXECSPACE> test;
+  test.testit();
+}
+
+}  // namespace Test
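
The Hankel-function tests above compare index 0 (where the Octave reference carries an infinite imaginary part) by exact equality, and every other entry against a relative tolerance of 1e-13. As a reading aid only, the criterion those EXPECT_LE lines repeat can be written as a small host-side helper; the helper name below is hypothetical and is not part of the test suite.

```cpp
// Illustrative sketch only: the relative-error criterion used by the
// EXPECT_LE checks above, factored into a hypothetical host-side helper.
#include <Kokkos_Core.hpp>

inline bool matches_reference(Kokkos::complex<double> computed,
                              Kokkos::complex<double> reference,
                              double rel_tol = 1.0e-13) {
  // Passes when the result agrees with the reference to roughly 13
  // significant digits: |computed - reference| <= |reference| * rel_tol.
  return Kokkos::abs(computed - reference) <= Kokkos::abs(reference) * rel_tol;
}
```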
diff --git a/packages/kokkos/core/unit_test/TestMemoryPool.hpp b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
index 63895ad47dc435c98201a2b46d8b439d2a50ad51..829e8d641a5b00a0be67200bdf30495951e95457 100644
--- a/packages/kokkos/core/unit_test/TestMemoryPool.hpp
+++ b/packages/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -50,7 +50,7 @@
 #include <cmath>
 #include <algorithm>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 namespace TestMemoryPool {
 
diff --git a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
index 6c8a47a5861dd361364a94551abcfd50d0e85153..d7607c4f71b0df4f32af77f4441c8c909992b14a 100644
--- a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
+++ b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp
@@ -48,7 +48,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -310,6 +310,46 @@ struct array_reduce {
     return lsum;
   }
 };
+
+struct point_t {
+  uint8_t x, y, z;
+
+  KOKKOS_FUNCTION
+  point_t() : x(1), y(1), z(1){};
+
+  KOKKOS_FUNCTION
+  point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){};
+
+  KOKKOS_FUNCTION
+  point_t(const volatile point_t &val) : x(val.x), y(val.y), z(val.z){};
+
+  KOKKOS_FUNCTION
+  point_t(const int rhs) { x = y = z = static_cast<uint8_t>(rhs); }
+
+  KOKKOS_FUNCTION
+  explicit operator int() const { return static_cast<int>(x + y + z); }
+
+  KOKKOS_FUNCTION
+  bool operator==(const volatile point_t rhs) const volatile {
+    return (x == rhs.x && y == rhs.y && z == rhs.z);
+  }
+
+  KOKKOS_FUNCTION
+  void operator=(point_t rhs) volatile {
+    x = rhs.x;
+    y = rhs.y;
+    z = rhs.z;
+  }
+
+  KOKKOS_FUNCTION
+  volatile point_t operator+=(const volatile point_t rhs) volatile {
+    x += rhs.x;
+    y += rhs.y;
+    z += rhs.z;
+    return *this;
+  }
+};
+
 }  // namespace Test
 
 namespace Kokkos {
@@ -334,5 +374,21 @@ struct reduction_identity<Test::array_reduce<scalar_t, N>> {
     return Test::array_reduce<scalar_t, N>(t_red_ident::prod());
   }
 };
+
+template <>
+struct reduction_identity<Test::point_t> {
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t sum() noexcept {
+    return 0;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t prod() noexcept {
+    return 1;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t max() noexcept {
+    return 0xff;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static uint8_t min() noexcept {
+    return 0x0;
+  }
+};
 }  // namespace Kokkos
 #endif  // TESTNONTRIVIALSCALARTYPES_HPP_
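
The point_t scalar added above is only usable in Kokkos reductions because of the pieces in this hunk: the copy and volatile assignment operators, the += overloads, the converting constructor from int, and the reduction_identity specialization. A minimal usage sketch follows; it is not taken from the test suite and assumes TestNonTrivialScalarTypes.hpp (shown above) has been included so that Test::point_t is visible.

```cpp
// Minimal sketch (not part of the test suite): reducing the custom point_t
// scalar on the host. Assumes the definitions from the hunk above are in
// scope, e.g. via including TestNonTrivialScalarTypes.hpp.
#include <Kokkos_Core.hpp>

void sum_points_example() {
  Test::point_t result;
  Kokkos::parallel_reduce(
      "sum_points",
      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, 10),
      KOKKOS_LAMBDA(int, Test::point_t& update) {
        update += Test::point_t(1);  // each iteration contributes (1, 1, 1)
      },
      Kokkos::Sum<Test::point_t>(result));
  // result.x, result.y, and result.z are each 10 after the reduction,
  // relying on the reduction_identity<point_t>::sum() identity defined above.
}
```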
diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
index fe01b83834f26eddc15e71360d77e85452ef0238..cb69cb83211e7b82f941e544b0498da6df737cf4 100644
--- a/packages/kokkos/core/unit_test/TestNumericTraits.hpp
+++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp
@@ -46,6 +46,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <type_traits>
+#include <limits>
 #include "Kokkos_NumericTraits.hpp"
 #include "Kokkos_ExecPolicy.hpp"
 
@@ -198,7 +199,9 @@ struct TestNumericTraits<
 TEST(TEST_CATEGORY, numeric_traits_infinity) {
   TestNumericTraits<TEST_EXECSPACE, float, Infinity>();
   TestNumericTraits<TEST_EXECSPACE, double, Infinity>();
+#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1 see issue #4100
   TestNumericTraits<TEST_EXECSPACE, long double, Infinity>();
+#endif
 }
 
 TEST(TEST_CATEGORY, numeric_traits_epsilon) {
@@ -334,3 +337,182 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) {
   TestNumericTraits<TEST_EXECSPACE, long double, MinExponent10>();
   TestNumericTraits<TEST_EXECSPACE, long double, MaxExponent10>();
 }
+
+namespace NumericTraitsSFINAE {
+
+struct HasNoSpecialization {};
+
+#define CHECK_TRAIT_IS_SFINAE_FRIENDLY(TRAIT)                              \
+  template <class T>                                                       \
+  using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT<T>::value); \
+  template <class T>                                                       \
+  using has_##TRAIT = Kokkos::is_detected<TRAIT##_value_t, T>;             \
+  static_assert(!has_##TRAIT<HasNoSpecialization>::value, "");
+
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_max)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(epsilon)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(round_error)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(norm_min)
+
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(digits)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(digits10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_digits10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(radix)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(min_exponent)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(min_exponent10)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_exponent)
+CHECK_TRAIT_IS_SFINAE_FRIENDLY(max_exponent10)
+
+}  // namespace NumericTraitsSFINAE
+
+// Example detecting presence or absence of values
+template <class T>
+using infinity_value_t = decltype(Kokkos::Experimental::infinity<T>::value);
+
+template <class T>
+using has_infinity = Kokkos::is_detected<infinity_value_t, T>;
+
+template <class T, std::enable_if_t<has_infinity<T>::value>* = nullptr>
+constexpr T legacy_std_numeric_limits_infinity() {
+  return Kokkos::Experimental::infinity<T>::value;
+}
+
+template <class T, std::enable_if_t<!has_infinity<T>::value>* = nullptr>
+constexpr T legacy_std_numeric_limits_infinity() {
+  return T();
+}
+
+TEST(TEST_CATEGORY, numeric_traits_sfinae_friendly) {
+  ASSERT_EQ(legacy_std_numeric_limits_infinity<int>(), 0);
+}
+
+// Compare to std::numeric_limits
+template <int V1, int V2>
+struct AssertIntEquality {
+  static constexpr bool value = false;
+};
+template <int V>
+struct AssertIntEquality<V, V> {
+  static constexpr bool value = true;
+};
+#define CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(T, TRAIT)           \
+  static_assert(AssertIntEquality<Kokkos::Experimental::TRAIT<T>::value, \
+                                  std::numeric_limits<T>::TRAIT>::value, \
+                "")
+#define CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \
+  static_assert(Kokkos::Experimental::TRAIT<T>::value ==       \
+                    std::numeric_limits<T>::TRAIT(),           \
+                "")
+
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, infinity);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, epsilon);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, epsilon);
+#ifndef KOKKOS_COMPILER_IBM  // fails with XL 16.1.1
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, epsilon);
+#endif
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(float, round_error);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(double, round_error);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, round_error);
+// clang-format off
+static_assert(Kokkos::Experimental::norm_min<float      >::value == std::numeric_limits<      float>::min(), "");
+static_assert(Kokkos::Experimental::norm_min<double     >::value == std::numeric_limits<     double>::min(), "");
+static_assert(Kokkos::Experimental::norm_min<long double>::value == std::numeric_limits<long double>::min(), "");
+// integer types
+static_assert(Kokkos::Experimental::finite_min<char                  >::value == std::numeric_limits<                  char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<signed char           >::value == std::numeric_limits<           signed char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned char         >::value == std::numeric_limits<         unsigned char>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<short                 >::value == std::numeric_limits<                 short>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned short        >::value == std::numeric_limits<        unsigned short>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<int                   >::value == std::numeric_limits<                   int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned int          >::value == std::numeric_limits<          unsigned int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<long int              >::value == std::numeric_limits<              long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned long int     >::value == std::numeric_limits<     unsigned long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<long long int         >::value == std::numeric_limits<         long long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_min<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::min(), "");
+static_assert(Kokkos::Experimental::finite_max<char                  >::value == std::numeric_limits<                  char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<signed char           >::value == std::numeric_limits<           signed char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned char         >::value == std::numeric_limits<         unsigned char>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<short                 >::value == std::numeric_limits<                 short>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned short        >::value == std::numeric_limits<        unsigned short>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<int                   >::value == std::numeric_limits<                   int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned int          >::value == std::numeric_limits<          unsigned int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long int              >::value == std::numeric_limits<              long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned long int     >::value == std::numeric_limits<     unsigned long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long long int         >::value == std::numeric_limits<         long long int>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<unsigned long long int>::value == std::numeric_limits<unsigned long long int>::max(), "");
+// floating point types
+static_assert(Kokkos::Experimental::finite_min<float      >::value == -std::numeric_limits<      float>::max(), "");
+static_assert(Kokkos::Experimental::finite_min<double     >::value == -std::numeric_limits<     double>::max(), "");
+static_assert(Kokkos::Experimental::finite_min<long double>::value == -std::numeric_limits<long double>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<float      >::value ==  std::numeric_limits<      float>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<double     >::value ==  std::numeric_limits<     double>::max(), "");
+static_assert(Kokkos::Experimental::finite_max<long double>::value ==  std::numeric_limits<long double>::max(), "");
+// clang-format on
+
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, digits);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_digits10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(signed char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned char, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(short, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned short, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(unsigned long long int, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, radix);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, min_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(float, max_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(double, max_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, min_exponent10);
+CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10);
+
+#undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION
+#undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT
diff --git a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 0017c690e75c6e1bde1808e87203d8dbbea754cc..d75d78b31f08f3d6234a174053630abace0d781a 100644
--- a/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/packages/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -291,34 +291,34 @@ class TestRangePolicyConstruction {
     using policy_t = Kokkos::RangePolicy<>;
     {
       policy_t p(5, 15);
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
     }
     {
       policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15);
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
     }
     {
       policy_t p(5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
     {
       policy_t p(Kokkos::DefaultExecutionSpace(), 5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
     {
       policy_t p;
-      ASSERT_TRUE((p.begin() == 0));
-      ASSERT_TRUE((p.end() == 0));
+      ASSERT_EQ(p.begin(), 0);
+      ASSERT_EQ(p.end(), 0);
       p = policy_t(5, 15, Kokkos::ChunkSize(10));
-      ASSERT_TRUE((p.begin() == 5));
-      ASSERT_TRUE((p.end() == 15));
-      ASSERT_TRUE((p.chunk_size() == 10));
+      ASSERT_EQ(p.begin(), 5);
+      ASSERT_EQ(p.end(), 15);
+      ASSERT_EQ(p.chunk_size(), 10);
     }
   }
 };
@@ -582,7 +582,7 @@ class TestTeamPolicyConstruction {
     ASSERT_EQ(p1.team_size(), team_size);
 // FIXME_SYCL implement chunk_size
 #ifndef KOKKOS_ENABLE_SYCL
-    ASSERT_TRUE(p1.chunk_size() > 0);
+    ASSERT_GT(p1.chunk_size(), 0);
 #endif
     ASSERT_EQ(p1.scratch_size(0), 0);
 
@@ -795,7 +795,7 @@ TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) {
   static_assert(sizeof(decltype(policy)) == 1, "");
   static_assert_dummy_policy_must_be_size_one<sizeof(decltype(policy))>
       _assert1{};
-  (void)_assert1;  // avoid unused variable warning
+  (void)&_assert1;  // avoid unused variable warning
 
   using Kokkos::Experimental::DesiredOccupancy;
   auto policy_with_occ =
@@ -805,7 +805,7 @@ TEST(TEST_CATEGORY, desired_occupancy_empty_base_optimization) {
   static_assert_dummy_policy_must_be_size_of_desired_occupancy<
       sizeof(decltype(policy_with_occ)), sizeof(DesiredOccupancy)>
       _assert2{};
-  (void)_assert2;  // avoid unused variable warning
+  (void)&_assert2;  // avoid unused variable warning
 }
 
 template <typename Policy>
diff --git a/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e45d84e7e05b5beaed658fff20201968fd0d1050
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestQuadPrecisionMath.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_LIBQUADMATH
+
+#include <impl/Kokkos_QuadPrecisionMath.hpp>
+#include <Kokkos_Core.hpp>
+
+#include <gtest/gtest.h>
+
+// FIXME instantiate only once for default host execution space
+TEST(TEST_CATEGORY, quad_precision_reductions) {
+  int const n = 100;
+  __float128 r;
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v += static_cast<__float128>(i); },
+      r);
+  EXPECT_EQ(r, n * (n - 1) / 2);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) { v += static_cast<__float128>(i); },
+      Kokkos::Sum<__float128>(r));
+  EXPECT_EQ(r, n * (n - 1) / 2);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) {
+        if (v > static_cast<__float128>(i)) {
+          v = static_cast<__float128>(i);
+        }
+      },
+      Kokkos::Min<__float128>(r));
+  EXPECT_EQ(r, 0);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n),
+      KOKKOS_LAMBDA(int i, __float128 &v) {
+        if (v < static_cast<__float128>(i)) {
+          v = static_cast<__float128>(i);
+        }
+      },
+      Kokkos::Max<__float128>(r));
+  EXPECT_EQ(r, n - 1);
+
+  Kokkos::parallel_reduce(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(1, n + 1),

+      KOKKOS_LAMBDA(int i, __float128 &v) { v *= static_cast<__float128>(i); },
+      Kokkos::Prod<__float128>(r));
+  EXPECT_FLOAT_EQ(r, tgammaq(n + 1));  // factorial(n) = tgamma(n+1)
+}
+
+TEST(TEST_CATEGORY, quad_precision_common_math_functions) {
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, 1),
+      KOKKOS_LAMBDA(int) {
+        (void)Kokkos::Experimental::fabs((__float128)0);
+        (void)Kokkos::Experimental::sqrt((__float128)1);
+        (void)Kokkos::Experimental::exp((__float128)2);
+        (void)Kokkos::Experimental::sin((__float128)3);
+        (void)Kokkos::Experimental::cosh((__float128)4);
+      });
+}
+
+#endif
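
The whole file above sits behind KOKKOS_ENABLE_LIBQUADMATH, so it only enters the build when Kokkos was configured with quadmath support. User code that wants to opt into __float128 in the same way can mirror that guard; the sketch below is illustrative, and the long double fallback is an assumption rather than anything prescribed by Kokkos.

```cpp
// Sketch: reuse the same compile-time guard as the test file above so that
// __float128 code paths only exist when Kokkos was built with quadmath.
#include <Kokkos_Macros.hpp>

#ifdef KOKKOS_ENABLE_LIBQUADMATH
using wide_real = __float128;   // quad precision is available
#else
using wide_real = long double;  // assumed fallback when quadmath is absent
#endif
```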
diff --git a/packages/kokkos/core/unit_test/TestRange.hpp b/packages/kokkos/core/unit_test/TestRange.hpp
index a6a6220f2dceea470414fb0d712796689f6d151c..d6b5d8fecc86173d3fa438cbca5b8242b48ddb36 100644
--- a/packages/kokkos/core/unit_test/TestRange.hpp
+++ b/packages/kokkos/core/unit_test/TestRange.hpp
@@ -317,10 +317,10 @@ struct TestRange {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
@@ -361,10 +361,10 @@ struct TestRange {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
diff --git a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
index 693f19613db6beb8c1c2a551574808de26633726..508b7192cb29aa87d3d28930d0babe0be1024432 100644
--- a/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
+++ b/packages/kokkos/core/unit_test/TestRangePolicyRequire.hpp
@@ -309,10 +309,10 @@ struct TestRangeRequire {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
@@ -353,10 +353,10 @@ struct TestRangeRequire {
           if (count(t) < min) min = count(t);
           if (count(t) > max) max = count(t);
         }
-        ASSERT_TRUE(min < max);
+        ASSERT_LT(min, max);
 
         // if ( ExecSpace::concurrency() > 2 ) {
-        //  ASSERT_TRUE( 2 * min < max );
+        //  ASSERT_LT( 2 * min, max );
         //}
       }
     }
diff --git a/packages/kokkos/core/unit_test/TestReduce.hpp b/packages/kokkos/core/unit_test/TestReduce.hpp
index 5f7fbd5623d6e8e4c25c261a0f092d79c1573fba..81e063f83e3ae4fba46f525756c262cb851d2068 100644
--- a/packages/kokkos/core/unit_test/TestReduce.hpp
+++ b/packages/kokkos/core/unit_test/TestReduce.hpp
@@ -539,6 +539,10 @@ class TestReduceDynamicView {
 
 }  // namespace
 
+// FIXME_OPENMPTARGET : The feature works with LLVM/13 on NVIDIA
+// architectures. The Jenkins CI currently tests with LLVM/12.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET) ||                             \
+    (defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300))
 TEST(TEST_CATEGORY, int64_t_reduce) {
   TestReduce<int64_t, TEST_EXECSPACE>(0);
   TestReduce<int64_t, TEST_EXECSPACE>(1000000);
@@ -563,7 +567,10 @@ TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) {
   TestReduceDynamicView<int64_t, TEST_EXECSPACE>(0);
   TestReduceDynamicView<int64_t, TEST_EXECSPACE>(1000000);
 }
+#endif
 
+// FIXME_OPENMPTARGET: Not yet implemented.
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
 TEST(TEST_CATEGORY, int_combined_reduce) {
   using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;
   constexpr uint64_t nw = 1000;
@@ -626,4 +633,5 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) {
   ASSERT_EQ(nsum, result2);
   ASSERT_EQ(nsum, result3_v());
 }
+#endif
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
index 68e7d746dd91a68046c4d074884ef5aef7519427..4664f265594b858e8879e7d2faa3aca62d320a0d 100644
--- a/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
+++ b/packages/kokkos/core/unit_test/TestReduceCombinatorical.hpp
@@ -439,11 +439,11 @@ struct TestReduceCombinatoricalInstantiation {
                        Test::ReduceCombinatorical::AddPlus<double>(value));
     if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) &&
         (ExecSpace::concurrency() > 1) && (expected_result > 0)) {
-      ASSERT_TRUE(expected_result < value);
+      ASSERT_LT(expected_result, value);
     } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) ||
                 (ExecSpace::concurrency() > 1)) &&
                (expected_result > 0)) {
-      ASSERT_TRUE(expected_result <= value);
+      ASSERT_LE(expected_result, value);
     } else {
       ASSERT_EQ(expected_result, value);
     }
@@ -453,11 +453,11 @@ struct TestReduceCombinatoricalInstantiation {
     CallParallelReduce(args..., add);
     if ((Kokkos::DefaultExecutionSpace::concurrency() > 1) &&
         (ExecSpace::concurrency() > 1) && (expected_result > 0)) {
-      ASSERT_TRUE(expected_result < value);
+      ASSERT_LT(expected_result, value);
     } else if (((Kokkos::DefaultExecutionSpace::concurrency() > 1) ||
                 (ExecSpace::concurrency() > 1)) &&
                (expected_result > 0)) {
-      ASSERT_TRUE(expected_result <= value);
+      ASSERT_LE(expected_result, value);
     } else {
       ASSERT_EQ(expected_result, value);
     }
diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp
index 35f0e231fd2a7b1e88bbf4be568532aa5c219e3f..0d5f7fe7ba538524e0119c950f01469c7aa48a83 100644
--- a/packages/kokkos/core/unit_test/TestReducers.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers.hpp
@@ -296,7 +296,8 @@ struct TestReducers {
     Scalar reference_sum = 0;
 
     for (int i = 0; i < N; i++) {
-      h_values(i) = (Scalar)(rand() % 100);
+      int denom   = sizeof(Scalar) <= 2 ? 10 : 100;
+      h_values(i) = (Scalar)(rand() % denom);
       reference_sum += h_values(i);
     }
     Kokkos::deep_copy(values, h_values);
diff --git a/packages/kokkos/core/unit_test/TestReducers_d.hpp b/packages/kokkos/core/unit_test/TestReducers_d.hpp
index e2254a1c1fe653b22c3e6b9a9ebad50d07a9eb89..2d5802cdd4fcde24e8ac1dfe0f8d42ba9eaf396b 100644
--- a/packages/kokkos/core/unit_test/TestReducers_d.hpp
+++ b/packages/kokkos/core/unit_test/TestReducers_d.hpp
@@ -64,4 +64,49 @@ TEST(TEST_CATEGORY, reducers_struct) {
   TestReducers<array_reduce<float, 7>, TEST_EXECSPACE>::test_sum(1031);
 #endif
 }
+
+TEST(TEST_CATEGORY, reducers_half_t) {
+  using ThisTestType = Kokkos::Experimental::half_t;
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(101);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(202);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(303);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(5);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(10);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(15);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(20);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(25);
+}
+
+TEST(TEST_CATEGORY, reducers_int8_t) {
+  using ThisTestType = int8_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_prod(4);
+}
+
+#if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET)
+// TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to
+//                 implicitly-deleted default constructor of 'conv_type'
+//                   conv_type tmp_in;"
+//
+// TODO - resolve:  4: [  FAILED  ] openmptarget.reducers_point_t (1 ms)
+TEST(TEST_CATEGORY, reducers_point_t) {
+  using ThisTestType = point_t;
+
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(1);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(2);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(3);
+  TestReducers<ThisTestType, TEST_EXECSPACE>::test_sum(4);
+}
+#endif  // !KOKKOS_ENABLE_HIP && !KOKKOS_ENABLE_OPENMPTARGET
+
 }  // namespace Test
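
The new reducers_int8_t and reducers_half_t cases above use small problem sizes, and the related change in TestReducers.hpp draws values from rand() % 10 whenever sizeof(Scalar) <= 2, presumably so the accumulated reference sum stays within what such narrow types can represent accurately. A quick back-of-the-envelope sketch of that overflow concern (illustrative only, not part of the test suite):

```cpp
// Illustrative only: why narrow scalar types need small test sizes and small
// per-element values. An int8_t sum exhausts its range very quickly.
#include <cstdint>
#include <cstdio>

int main() {
  int8_t sum    = 0;
  const int n   = 4;  // matches the largest test_sum(4) call above
  const int max = 9;  // values are drawn from rand() % 10
  for (int i = 0; i < n; ++i) sum += max;
  std::printf("worst-case sum = %d (fits in the int8_t range [-128, 127])\n",
              static_cast<int>(sum));
  return 0;
}
```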
diff --git a/packages/kokkos/core/unit_test/TestReductions.hpp b/packages/kokkos/core/unit_test/TestReductions.hpp
index 949ca7eaf30a4746a8fec355f1b62c035c83d041..1fa8a2e92e68f7c3bf34e6cc4cc96b29b73071f3 100644
--- a/packages/kokkos/core/unit_test/TestReductions.hpp
+++ b/packages/kokkos/core/unit_test/TestReductions.hpp
@@ -45,8 +45,6 @@
 #ifndef KOKKOS_TEST_REDUCTIONS_HPP
 #define KOKKOS_TEST_REDUCTIONS_HPP
 #include <Kokkos_Macros.hpp>
-#ifndef KOKKOS_ENABLE_OPENMPTARGET
 #include <TestReduce.hpp>
-#endif
 #include <TestCXX11Deduction.hpp>
 #endif
diff --git a/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
index 17563de335e5b6a6170985e392ea8ae0de5ae8c1..6ffa11b11ca2d639bd9fd930a733d41ae7950482 100644
--- a/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
+++ b/packages/kokkos/core/unit_test/TestReductions_DeviceView.hpp
@@ -32,11 +32,17 @@ void test_reduce_device_view(int64_t N, PolicyType policy,
   typename ExecSpace::execution_space().fence();
   double time_fence0 = timer.seconds();
   Kokkos::deep_copy(result, 0);
+
+  // We need a warm-up to get reasonable results
+  Kokkos::parallel_reduce("Test::ReduceDeviceView::TestReducer", policy,
+                          functor,
+                          Kokkos::Sum<int64_t, TEST_EXECSPACE>(result));
+  Kokkos::fence();
+
   timer.reset();
   bool is_async = time0 < time_fence0;
 
   // Test Reducer
-
   Kokkos::parallel_reduce("Test::ReduceDeviceView::TestReducer", policy,
                           functor,
                           Kokkos::Sum<int64_t, TEST_EXECSPACE>(result));
@@ -75,11 +81,11 @@ void test_reduce_device_view(int64_t N, PolicyType policy,
 
   ASSERT_EQ(N, scalar_result);
   if (is_async) {
-    ASSERT_TRUE(time1 < time_fence1);
+    ASSERT_LT(time1, time_fence1);
   }
   if (is_async) {
-    ASSERT_TRUE(time2 < time_fence2);
-    ASSERT_TRUE(time3 > time_fence3);
+    ASSERT_LT(time2, time_fence2);
+    ASSERT_GT(time3, time_fence3);
   }
 }
 
@@ -128,8 +134,6 @@ TEST(TEST_CATEGORY, reduce_device_view_mdrange_policy) {
       MDRangePolicyFunctor());
 }
 
-// FIXME_HIP
-#ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, reduce_device_view_team_policy) {
 // FIXME_SYCL The number of workgroups on CUDA devices can not be larger than
 // 65535
@@ -145,5 +149,4 @@ TEST(TEST_CATEGORY, reduce_device_view_team_policy) {
       TeamPolicyFunctor(1024));
 #endif
 }
-#endif
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/TestStackTrace.hpp b/packages/kokkos/core/unit_test/TestStackTrace.hpp
index 284332f3f85e87b9f8fc030084ecc78448da4e38..d34d0f92e959277e7f6a66c0718ce381cd794e61 100644
--- a/packages/kokkos/core/unit_test/TestStackTrace.hpp
+++ b/packages/kokkos/core/unit_test/TestStackTrace.hpp
@@ -73,10 +73,10 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
 
     if (bDynamic) {
       printf("test_f1: %s \n", foutput.c_str());
-      ASSERT_TRUE(std::string::npos != foutput.find("stacktrace_test_f1"));
+      ASSERT_NE(std::string::npos, foutput.find("stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
-        ASSERT_TRUE(std::string::npos == foutput.find(x));
+        ASSERT_EQ(std::string::npos, foutput.find(x));
       }
     }
   }
@@ -92,7 +92,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
                   foutput.find("Test::stacktrace_test_f1"));
       for (auto x : {"stacktrace_test_f0", "stacktrace_test_f2",
                      "stacktrace_test_f3", "stacktrace_test_f4"}) {
-        ASSERT_TRUE(std::string::npos == foutput.find(x));
+        ASSERT_EQ(std::string::npos, foutput.find(x));
       }
     }
   }
@@ -114,7 +114,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
       std::string foutput = sstream.str();
       printf("test_f3: %s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
-        ASSERT_TRUE(std::string::npos != foutput.find(x));
+        ASSERT_NE(std::string::npos, foutput.find(x));
       }
     }
     // TODO make sure stacktrace_test_f2/4 don't show up
@@ -129,7 +129,7 @@ void test_stacktrace(bool bTerminate, bool bCustom = true) {
       std::string foutput = sstream.str();
       printf("demangled test_f3: %s \n", foutput.c_str());
       for (auto x : {"stacktrace_test_f1", "stacktrace_test_f3"}) {
-        ASSERT_TRUE(std::string::npos != foutput.find(x));
+        ASSERT_NE(std::string::npos, foutput.find(x));
       }
     }
 
diff --git a/packages/kokkos/core/unit_test/TestTeam.hpp b/packages/kokkos/core/unit_test/TestTeam.hpp
index 97ddfd4cf58518bfa494eedf4445ba68fdb1132a..a5e3de85bbc49508a2fe3c456860da9aa0b8af57 100644
--- a/packages/kokkos/core/unit_test/TestTeam.hpp
+++ b/packages/kokkos/core/unit_test/TestTeam.hpp
@@ -137,8 +137,10 @@ struct TestTeamPolicy {
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
         smallest_work, smallest_work, smallest_work);
 #endif
+    (void)none_auto;
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
         smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
+    (void)both_auto;
     // FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(smallest_work, 32,
@@ -147,8 +149,10 @@ struct TestTeamPolicy {
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
         smallest_work, smallest_work, Kokkos::AUTO());
 #endif
+    (void)auto_vector;
     Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
         smallest_work, Kokkos::AUTO(), smallest_work);
+    (void)auto_team;
   }
 
   static void test_for(const size_t league_size) {
@@ -970,7 +974,11 @@ struct ClassNoShmemSizeFunction {
                 double *, ExecSpace,
                 Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
 
-    int team_size = 8;
+#ifdef KOKKOS_ENABLE_SYCL
+    int team_size = 4;
+#else
+    int team_size      = 8;
+#endif
     if (team_size > ExecSpace::concurrency())
       team_size = ExecSpace::concurrency();
     {
@@ -1115,7 +1123,11 @@ void test_team_mulit_level_scratch_test_lambda() {
       Kokkos::View<double *, ExecSpace,
                    Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size(1600);
 
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 4;
+#else
   int team_size = 8;
+#endif
   if (team_size > ExecSpace::concurrency())
     team_size = ExecSpace::concurrency();
 
@@ -1400,7 +1412,7 @@ struct TestTeamBroadcast<
     // above because the functor switches it back.
     bool setValue = ((lid % ts) != tid);
 
-    teamMember.team_broadcast([&](value_type &var) { var *= 2; }, value,
+    teamMember.team_broadcast([&](value_type &var) { var += var; }, value,
                               lid % ts);
     teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue,
                               lid % ts);
@@ -1465,7 +1477,7 @@ struct TestTeamBroadcast<
     value_type expected_result = 0;
     for (unsigned int i = 0; i < league_size; i++) {
       value_type val =
-          (value_type((i % team_size) * 3) + off) * (value_type)team_size;
+          (value_type((i % team_size) * 3) + off) * value_type(team_size);
       expected_result += val;
     }
     // For comparison purposes treat the reduction as a random walk in the
diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
index 87c010ac2a0c5701916049532a715c6a5addce15..17899f63b1f7816cff75a34ccdce0b42d0ee1b3e 100644
--- a/packages/kokkos/core/unit_test/TestTeamBasic.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp
@@ -105,6 +105,75 @@ TEST(TEST_CATEGORY, team_broadcast_long) {
                     long>::test_teambroadcast(1000, 1);
 }
 
+// FIXME_OPENMPTARGET CI fails with
+// Libomptarget error: Copying data from device failed.
+// Possibly because long_wrapper is not trivially copyable.
+#ifndef KOKKOS_ENABLE_OPENMPTARGET
+struct long_wrapper {
+  long value;
+
+  KOKKOS_FUNCTION
+  long_wrapper() : value(0) {}
+
+  KOKKOS_FUNCTION
+  long_wrapper(long val) : value(val) {}
+
+  KOKKOS_FUNCTION
+  friend void operator+=(long_wrapper& lhs, const long_wrapper& rhs) {
+    lhs.value += rhs.value;
+  }
+
+  KOKKOS_FUNCTION
+  friend void operator+=(volatile long_wrapper& lhs,
+                         const volatile long_wrapper& rhs) {
+    lhs.value += rhs.value;
+  }
+
+  KOKKOS_FUNCTION
+  void operator=(const long_wrapper& other) { value = other.value; }
+
+  KOKKOS_FUNCTION
+  void operator=(const volatile long_wrapper& other) volatile {
+    value = other.value;
+  }
+  KOKKOS_FUNCTION
+  operator long() const { return value; }
+};
+}  // namespace Test
+
+namespace Kokkos {
+template <>
+struct reduction_identity<Test::long_wrapper>
+    : public reduction_identity<long> {};
+}  // namespace Kokkos
+
+namespace Test {
+
+// Test for non-arithmetic type
+TEST(TEST_CATEGORY, team_broadcast_long_wrapper) {
+  static_assert(!std::is_arithmetic<long_wrapper>::value, "");
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(0, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(0, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(2, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(2, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(16, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(16, 1);
+
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
+                    long_wrapper>::test_teambroadcast(1000, 1);
+  TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic>,
+                    long_wrapper>::test_teambroadcast(1000, 1);
+}
+#endif
+
 TEST(TEST_CATEGORY, team_broadcast_char) {
   {
     TestTeamBroadcast<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static>,
diff --git a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
index 3db0eafa339de221a8dad8feb3cf7b3fa62027f2..836134afe0cd4d537520b12c80dd4efaafc21f38 100644
--- a/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamReductionScan.hpp
@@ -53,14 +53,8 @@ TEST(TEST_CATEGORY, team_reduction_scan) {
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(0);
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10);
   TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10);
-// FIXME_HIP
-#ifdef KOKKOS_ENABLE_HIP
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10000);
-    TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10000);
-  }
+  TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >(10000);
+  TestScanTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >(10000);
 }
 
 TEST(TEST_CATEGORY, team_long_reduce) {
diff --git a/packages/kokkos/core/unit_test/TestTeamScratch.hpp b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
index 75ca3587629ded5f5cc2dd2f3b8ef6623e8a07f7..bab937273ddee06bb55b17b4fefc567c98bac30b 100644
--- a/packages/kokkos/core/unit_test/TestTeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamScratch.hpp
@@ -54,15 +54,8 @@ TEST(TEST_CATEGORY, team_shared_request) {
 }
 
 TEST(TEST_CATEGORY, team_scratch_request) {
-  // FIXME_HIP the parallel_reduce in this test requires a team size larger than
-  // 256. Fixed in ROCm 3.9
-#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
-    TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
-  }
+  TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam<TEST_EXECSPACE, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
@@ -78,21 +71,14 @@ TEST(TEST_CATEGORY, scratch_align) { TestScratchAlignment<TEST_EXECSPACE>(); }
 TEST(TEST_CATEGORY, shmem_size) { TestShmemSize<TEST_EXECSPACE>(); }
 
 TEST(TEST_CATEGORY, multi_level_scratch) {
-  // FIXME_HIP the parallel_for and the parallel_reduce in this test requires a
-  // team size larger than 256. Fixed In ROCm 3.9
   // FIXME_OPENMPTARGET This unit test needs ~350KB of scratch memory for L0 and
   // L1 combined per team. Currently OpenMPTarget cannot allocate this high
   // amount of scratch memory.
 #if !defined(KOKKOS_ENABLE_OPENMPTARGET)
-#if defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 309)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value)
-#endif
-  {
-    TestMultiLevelScratchTeam<TEST_EXECSPACE,
-                              Kokkos::Schedule<Kokkos::Static> >();
-    TestMultiLevelScratchTeam<TEST_EXECSPACE,
-                              Kokkos::Schedule<Kokkos::Dynamic> >();
-  }
+  TestMultiLevelScratchTeam<TEST_EXECSPACE,
+                            Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam<TEST_EXECSPACE,
+                            Kokkos::Schedule<Kokkos::Dynamic> >();
 #endif
 }
 
diff --git a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
index 992e80397bacb9b5dc9a0746ca2543a1792cce22..f64c5b8809a214d4e2376e43df29d7900eccd1de 100644
--- a/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamTeamSize.hpp
@@ -110,9 +110,9 @@ void test_team_policy_max_recommended_static_size(int scratch_size) {
   int team_size_rec_reduce = p.team_size_recommended(
       FunctorReduce<T, N, PolicyType, S>(), Kokkos::ParallelReduceTag());
 
-  ASSERT_TRUE(team_size_max_for >= team_size_rec_for);
-  ASSERT_TRUE(team_size_max_reduce >= team_size_rec_reduce);
-  ASSERT_TRUE(team_size_max_for >= team_size_max_reduce);
+  ASSERT_GE(team_size_max_for, team_size_rec_for);
+  ASSERT_GE(team_size_max_reduce, team_size_rec_reduce);
+  ASSERT_GE(team_size_max_for, team_size_max_reduce);
 
   Kokkos::parallel_for(PolicyType(10000, team_size_max_for, 4)
                            .set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
@@ -122,13 +122,6 @@ void test_team_policy_max_recommended_static_size(int scratch_size) {
                        FunctorFor<T, N, PolicyType, S>());
   MyArray<T, N> val;
   double n_leagues = 10000;
-  // FIXME_HIP
-#ifdef KOKKOS_ENABLE_HIP
-  if (N == 2)
-    n_leagues = 1000;
-  else
-    n_leagues = 500;
-#endif
 
   Kokkos::parallel_reduce(
       PolicyType(n_leagues, team_size_max_reduce, 4)
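
The switch from ASSERT_TRUE(a >= b) to ASSERT_GE(a, b) is purely about diagnostics: GoogleTest's comparison macros print both operand values when the check fails. A small, self-contained illustration (the test name and values are hypothetical; the second expectation is intentionally violated to show the messages):

#include <gtest/gtest.h>

TEST(AssertStyle, ComparisonMacrosPrintOperands) {
  const int team_size_max = 256;
  const int team_size_rec = 512;  // deliberately violates the expectation

  // On failure this only reports that the expression evaluated to false.
  EXPECT_TRUE(team_size_max >= team_size_rec);

  // On failure this reports
  //   Expected: (team_size_max) >= (team_size_rec), actual: 256 vs 512
  // so both operands are visible without rerunning the test.
  EXPECT_GE(team_size_max, team_size_rec);
}
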
diff --git a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp
index ba11dc07a962989f2826a3d0def3649112c00da6..dbed67475615606915cfcc05959de312f9eacbfd 100644
--- a/packages/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -111,7 +111,7 @@ struct functor_team_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_parallel_for %i %i %f %f\n", team.league_rank(),
+              "FAILED team_parallel_for %i %i %lf %lf\n", team.league_rank(),
               team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
           flag() = 1;
@@ -321,10 +321,9 @@ struct functor_team_vector_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_vector_parallel_for %i %i %f %f\n",
+              "FAILED team_vector_parallel_for %i %i %lf %lf\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
-
           flag() = 1;
         }
       });
@@ -372,7 +371,7 @@ struct functor_team_vector_reduce {
       if (test != value) {
         if (team.league_rank() == 0) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+              "FAILED team_vector_parallel_reduce %i %i %lf %lf %lu\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value),
               static_cast<unsigned long>(sizeof(Scalar)));
@@ -424,7 +423,7 @@ struct functor_team_vector_reduce_reducer {
 
       if (test != value) {
         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED team_vector_parallel_reduce_reducer %i %i %f %f\n",
+            "FAILED team_vector_parallel_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), static_cast<double>(test),
             static_cast<double>(value));
 
@@ -471,8 +470,9 @@ struct functor_vec_single {
 
     if (value2 != (value * Scalar(nEnd - nStart))) {
       KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-          "FAILED vector_single broadcast %i %i %f %f\n", team.league_rank(),
-          team.team_rank(), (double)value2, (double)value);
+          "FAILED vector_single broadcast %i %i %lf %lf\n", team.league_rank(),
+          team.team_rank(), static_cast<double>(value2),
+          static_cast<double>(value));
 
       flag() = 1;
     }
@@ -523,7 +523,7 @@ struct functor_vec_for {
         }
 
         if (test != value) {
-          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %f %f\n",
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_for %i %i %lf %lf\n",
                                         team.league_rank(), team.team_rank(),
                                         static_cast<double>(test),
                                         static_cast<double>(value));
@@ -560,10 +560,9 @@ struct functor_vec_red {
       for (int i = 0; i < 13; i++) test += i;
 
       if (test != value) {
-        KOKKOS_IMPL_DO_NOT_USE_PRINTF("FAILED vector_par_reduce %i %i %f %f\n",
-                                      team.league_rank(), team.team_rank(),
-                                      (double)test, (double)value);
-
+        KOKKOS_IMPL_DO_NOT_USE_PRINTF(
+            "FAILED vector_par_reduce %i %i %lf %lf\n", team.league_rank(),
+            team.team_rank(), (double)test, (double)value);
         flag() = 1;
       }
     });
@@ -600,7 +599,7 @@ struct functor_vec_red_reducer {
 
       if (test != value) {
         KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-            "FAILED vector_par_reduce_reducer %i %i %f %f\n",
+            "FAILED vector_par_reduce_reducer %i %i %lf %lf\n",
             team.league_rank(), team.team_rank(), (double)test, (double)value);
 
         flag() = 1;
@@ -630,9 +629,10 @@ struct functor_vec_scan {
 
                               if (test != val) {
                                 KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-                                    "FAILED vector_par_scan %i %i %f %f\n",
+                                    "FAILED vector_par_scan %i %i %lf %lf\n",
                                     team.league_rank(), team.team_rank(),
-                                    (double)test, (double)val);
+                                    static_cast<double>(test),
+                                    static_cast<double>(val));
 
                                 flag() = 1;
                               }
@@ -723,7 +723,12 @@ template <class ExecutionSpace>
 bool Test(int test) {
   bool passed = true;
 
+// With SYCL, 33*8 exceeds the maximum work group size.
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 31;
+#else
   int team_size = 33;
+#endif
   if (team_size > int(ExecutionSpace::concurrency()))
     team_size = int(ExecutionSpace::concurrency());
   passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
@@ -856,7 +861,7 @@ template <typename ScalarType, class DeviceType>
 class TestTripleNestedReduce {
  public:
   using execution_space = DeviceType;
-  using size_type       = typename execution_space::size_type;
+  using size_type = typename execution_space::size_type;
 
   TestTripleNestedReduce(const size_type &, const size_type, const size_type &,
                          const size_type) {}
@@ -1000,17 +1005,24 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) {
 // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run
 // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80
 // GPU) See https://github.com/kokkos/kokkos/issues/1513
+// For Intel GPUs, the requested workgroup size is just too large here.
 #if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
-  if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) {
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value)
+#elif defined(KOKKOS_ENABLE_SYCL)
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
 #endif
+  {
     TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 32);
     TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 32, 16);
-#if defined(KOKKOS_ENABLE_DEBUG) && defined(KOKKOS_ENABLE_CUDA)
   }
+#if defined(KOKKOS_ENABLE_SYCL)
+  if (!std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value)
 #endif
+  {
+    TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33);
+    TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 19);
+  }
   TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 16);
-  TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 33);
-  TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 16, 19);
   TestTripleNestedReduce<double, TEST_EXECSPACE>(8192, 2048, 7, 16);
 }
 #endif
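
The hard-coded SYCL team size of 31 above works around the backend's maximum work-group size. The portable way to find out what a backend can actually launch is to query the policy itself, as the TestTeamTeamSize.hpp changes earlier in this patch already do. A minimal sketch, assuming a configured Kokkos build; DummyFunctor and the main() driver are illustrative only:

#include <Kokkos_Core.hpp>
#include <cstdio>

struct DummyFunctor {
  KOKKOS_FUNCTION void operator()(
      const Kokkos::TeamPolicy<>::member_type& /*team*/) const {}
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::TeamPolicy<> policy(/*league_size=*/1, Kokkos::AUTO);
    // Ask the backend for the largest and the recommended team size for this
    // particular functor instead of hard-coding a value per backend.
    const int max_size =
        policy.team_size_max(DummyFunctor(), Kokkos::ParallelForTag());
    const int rec_size =
        policy.team_size_recommended(DummyFunctor(), Kokkos::ParallelForTag());
    std::printf("team_size_max=%d team_size_recommended=%d\n", max_size,
                rec_size);
  }
  Kokkos::finalize();
}
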
diff --git a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
index 7342ebad8433526719b52058ff6d6b75e41a107a..c4116b91392e2020ecf0a030f96536c3a47a6dfa 100644
--- a/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
+++ b/packages/kokkos/core/unit_test/TestTeamVectorRange.hpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <iostream>
 #include <cstdlib>
 #include <cstdint>
@@ -280,7 +280,7 @@ struct functor_teamvector_for {
 
         if (test != value) {
           KOKKOS_IMPL_DO_NOT_USE_PRINTF(
-              "FAILED teamvector_parallel_for %i %i %f %f\n",
+              "FAILED teamvector_parallel_for %i %i %lf %lf\n",
               team.league_rank(), team.team_rank(), static_cast<double>(test),
               static_cast<double>(value));
           flag() = 1;
@@ -493,7 +493,12 @@ template <class ExecutionSpace>
 bool Test(int test) {
   bool passed = true;
 
+// With SYCL, 33*8 exceeds the maximum work group size.
+#ifdef KOKKOS_ENABLE_SYCL
+  int team_size = 31;
+#else
   int team_size = 33;
+#endif
   if (team_size > int(ExecutionSpace::concurrency()))
     team_size = int(ExecutionSpace::concurrency());
   passed = passed && test_scalar<int, ExecutionSpace>(317, team_size, test);
diff --git a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
index a0bc7c4304a040a10bc182e5d23d7c9ba08c4110..a0d00ded1b1586e67eb6bc09f93cf386239c3e2d 100644
--- a/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
+++ b/packages/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -138,72 +138,38 @@ struct SumInitJoinFinalValueTypeArray {
   }
 };
 
-template <class Scalar, class ExecutionSpace>
-struct SumWrongInitJoinFinalValueType {
-  using execution_space = ExecutionSpace;
-  using type            = typename Kokkos::View<Scalar*, execution_space>;
-  using value_type      = Scalar;
-
-  type view;
-
-  SumWrongInitJoinFinalValueType(type view_) : view(view_) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void init(double& val) const { val = double(); }
-
-  KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const value_type& src) const {
-    val += src;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(int /*i*/, value_type& val) const { val += value_type(); }
-};
-
 template <class Scalar, class ExecutionSpace>
 void TestTemplateMetaFunctions() {
-  using type = typename Kokkos::View<Scalar*, ExecutionSpace>;
-  type a("A", 100);
-  /*
-    int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit< SumPlain<Scalar,
-    ExecutionSpace>, Scalar & >::value; ASSERT_EQ( sum_plain_has_init_arg, 0 );
-    int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<
-    SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg, 1 );
-    int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<
-    SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg2, 1 );
-    int sum_wronginitjoinfinalvaluetype_has_init_arg =
-    Kokkos::Impl::FunctorHasInit< SumWrongInitJoinFinalValueType<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ(
-    sum_wronginitjoinfinalvaluetype_has_init_arg, 0 );
-
-    //int sum_initjoinfinalvaluetypearray_has_init_arg =
-    Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueTypeArray<Scalar,
-    ExecutionSpace>, Scalar[] >::value;
-    //ASSERT_EQ( sum_initjoinfinalvaluetypearray_has_init_arg, 1 );
-
-    //printf( "Values Init: %i %i %i\n", sum_plain_has_init_arg,
-    sum_initjoinfinalvaluetype_has_init_arg,
-    sum_wronginitjoinfinalvaluetype_has_init_arg );
-
-    int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumPlain<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ( sum_plain_has_join_arg, 0 );
-    int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<
-    SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg, 1 );
-    int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<
-    SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value;
-    ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg2, 1 );
-    int sum_wronginitjoinfinalvaluetype_has_join_arg =
-    Kokkos::Impl::FunctorHasJoin< SumWrongInitJoinFinalValueType<Scalar,
-    ExecutionSpace>, Scalar >::value; ASSERT_EQ(
-    sum_wronginitjoinfinalvaluetype_has_join_arg, 0 );
-
-    //printf( "Values Join: %i %i %i\n", sum_plain_has_join_arg,
-    sum_initjoinfinalvaluetype_has_join_arg,
-    sum_wronginitjoinfinalvaluetype_has_join_arg );
-  */
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<SumPlain<Scalar, ExecutionSpace>,
+                                         Scalar&>::value == false,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
+      "");
+
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasInit<
+          SumInitJoinFinalValueTypeArray<Scalar, ExecutionSpace>>::value ==
+          true,
+      "");
+
+  static_assert(Kokkos::Impl::ReduceFunctorHasJoin<
+                    SumPlain<Scalar, ExecutionSpace>>::value == false,
+                "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasJoin<
+          SumInitJoinFinalValueType<Scalar, ExecutionSpace>>::value == true,
+      "");
+  static_assert(
+      Kokkos::Impl::ReduceFunctorHasJoin<
+          SumInitJoinFinalValueType2<Scalar, ExecutionSpace>>::value == true,
+      "");
 }
 
 }  // namespace
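
The commented-out FunctorHasInit/FunctorHasJoin checks are replaced above by compile-time static_asserts against the ReduceFunctorHasInit/ReduceFunctorHasJoin traits. For readers unfamiliar with how such traits work, here is a generic sketch of the detection idiom they are built on; this is not Kokkos' implementation, only an illustration, and the WithInit/WithoutInit functors are hypothetical:

#include <type_traits>
#include <utility>

// Map any pack of types to void; used to probe whether an expression is valid.
template <class...>
using void_t = void;

// Primary template: assume no init(value_type&) member.
template <class F, class = void>
struct has_init : std::false_type {};

// Chosen only when F::value_type exists and f.init(value_type&) is well-formed.
template <class F>
struct has_init<F, void_t<decltype(std::declval<const F&>().init(
                      std::declval<typename F::value_type&>()))>>
    : std::true_type {};

// Functors shaped like the ones exercised in this test:
struct WithInit {
  using value_type = double;
  void init(double& v) const { v = 0.0; }
  void operator()(int, double& v) const { v += 1.0; }
};
struct WithoutInit {
  using value_type = double;
  void operator()(int, double& v) const { v += 1.0; }
};

static_assert(has_init<WithInit>::value, "init(value_type&) detected");
static_assert(!has_init<WithoutInit>::value, "no init member detected");
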
diff --git a/packages/kokkos/core/unit_test/TestTypeList.cpp b/packages/kokkos/core/unit_test/TestTypeList.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e450d11562819c756334daeea91f3f448df624db
--- /dev/null
+++ b/packages/kokkos/core/unit_test/TestTypeList.cpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <impl/Kokkos_Utilities.hpp>
+
+using TypeList2 = Kokkos::Impl::type_list<void, bool>;
+using TypeList3 = Kokkos::Impl::type_list<char, short, int>;
+using TypeList223 =
+    Kokkos::Impl::type_list<void, bool, void, bool, char, short, int>;
+using TypeList223Void   = Kokkos::Impl::type_list<void, void>;
+using TypeList223NoVoid = Kokkos::Impl::type_list<bool, bool, char, short, int>;
+
+// concat_type_list
+using ConcatTypeList2 = Kokkos::Impl::concat_type_list_t<TypeList2>;
+static_assert(std::is_same<TypeList2, ConcatTypeList2>::value,
+              "concat_type_list of a single type_list failed");
+
+using ConcatTypeList223 =
+    Kokkos::Impl::concat_type_list_t<TypeList2, TypeList2, TypeList3>;
+static_assert(std::is_same<TypeList223, ConcatTypeList223>::value,
+              "concat_type_list of three type_lists failed");
+
+// filter_type_list
+using FilterTypeList223Void =
+    Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223>;
+static_assert(std::is_same<TypeList223Void, FilterTypeList223Void>::value,
+              "filter_type_list with predicate value==true failed");
+
+using FilterTypeList223NoVoid =
+    Kokkos::Impl::filter_type_list_t<std::is_void, TypeList223, false>;
+static_assert(std::is_same<TypeList223NoVoid, FilterTypeList223NoVoid>::value,
+              "filter_type_list with predicate value==false failed");
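
The static_asserts in this new test pin down the observable behavior of concat_type_list_t and filter_type_list_t from impl/Kokkos_Utilities.hpp. As a plain-C++ illustration of what list concatenation of this kind does (this is not the Kokkos implementation, just a stand-alone sketch):

#include <type_traits>

template <class... Ts>
struct type_list {};

// Left-fold concatenation: merge two lists at a time until one remains.
template <class...>
struct concat;

template <class... As>
struct concat<type_list<As...>> {
  using type = type_list<As...>;
};

template <class... As, class... Bs, class... Tail>
struct concat<type_list<As...>, type_list<Bs...>, Tail...>
    : concat<type_list<As..., Bs...>, Tail...> {};

template <class... Lists>
using concat_t = typename concat<Lists...>::type;

static_assert(
    std::is_same<concat_t<type_list<void, bool>, type_list<void, bool>,
                          type_list<char, short, int>>,
                 type_list<void, bool, void, bool, char, short, int>>::value,
    "three lists concatenate in order, mirroring TypeList223 above");
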
diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp
index 570281f9fd66a230e69b9bb924a84a0078e12168..73531e6196f0ca145789ef98f680328ece747df9 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1060,12 +1060,12 @@ class TestViewAPI {
     dView4 dx, dy, dz;
     hView4 hx, hy, hz;
 
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
-    ASSERT_TRUE(hx.data() == nullptr);
-    ASSERT_TRUE(hy.data() == nullptr);
-    ASSERT_TRUE(hz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
+    ASSERT_EQ(hx.data(), nullptr);
+    ASSERT_EQ(hy.data(), nullptr);
+    ASSERT_EQ(hz.data(), nullptr);
     ASSERT_EQ(dx.extent(0), 0u);
     ASSERT_EQ(dy.extent(0), 0u);
     ASSERT_EQ(dz.extent(0), 0u);
@@ -1116,11 +1116,11 @@ class TestViewAPI {
 
     ASSERT_EQ(dx.use_count(), size_t(2));
 
-    ASSERT_FALSE(dx.data() == nullptr);
-    ASSERT_FALSE(const_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
+    ASSERT_NE(dx.data(), nullptr);
+    ASSERT_NE(const_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
     ASSERT_NE(dx, dy);
 
     ASSERT_EQ(dx.extent(0), unsigned(N0));
@@ -1257,19 +1257,19 @@ class TestViewAPI {
     ASSERT_NE(dx, dz);
 
     dx = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
 
     dy = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_FALSE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_NE(dz.data(), nullptr);
 
     dz = dView4();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_TRUE(dy.data() == nullptr);
-    ASSERT_TRUE(dz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);
+    ASSERT_EQ(dy.data(), nullptr);
+    ASSERT_EQ(dz.data(), nullptr);
   }
 
   static void run_test_deep_copy_empty() {
@@ -1304,7 +1304,7 @@ class TestViewAPI {
   static void check_auto_conversion_to_const(
       const Kokkos::View<const DataType, device> &arg_const,
       const Kokkos::View<DataType, device> &arg) {
-    ASSERT_TRUE(arg_const == arg);
+    ASSERT_EQ(arg_const, arg);
   }
 
   static void run_test_const() {
@@ -1317,8 +1317,8 @@ class TestViewAPI {
     const_typeX xc = x;
     const_typeR xr = x;
 
-    ASSERT_TRUE(xc == x);
-    ASSERT_TRUE(x == xc);
+    ASSERT_EQ(xc, x);
+    ASSERT_EQ(x, xc);
 
     // For CUDA the constant random access View does not return
     // an lvalue reference due to retrieving through texture cache
@@ -1327,7 +1327,7 @@ class TestViewAPI {
     if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value)
 #endif
     {
-      ASSERT_TRUE(x.data() == xr.data());
+      ASSERT_EQ(x.data(), xr.data());
     }
 
     // typeX xf = xc; // Setting non-const from const must not compile.
@@ -1440,29 +1440,29 @@ class TestViewAPI {
     const_vector_right_type cvr2 = Kokkos::subview(mv, Kokkos::ALL(), 1);
     const_vector_right_type cvr3 = Kokkos::subview(mv, Kokkos::ALL(), 2);
 
-    ASSERT_TRUE(&v1[0] == &v1(0));
-    ASSERT_TRUE(&v1[0] == &mv(0, 0));
-    ASSERT_TRUE(&v2[0] == &mv(0, 1));
-    ASSERT_TRUE(&v3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cv1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cv2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cv3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&vr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&vr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&vr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&cvr1[0] == &mv(0, 0));
-    ASSERT_TRUE(&cvr2[0] == &mv(0, 1));
-    ASSERT_TRUE(&cvr3[0] == &mv(0, 2));
-
-    ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2));
-    ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3));
-    ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4));
-    ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2));
-    ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3));
-    ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4));
+    ASSERT_EQ(&v1[0], &v1(0));
+    ASSERT_EQ(&v1[0], &mv(0, 0));
+    ASSERT_EQ(&v2[0], &mv(0, 1));
+    ASSERT_EQ(&v3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cv1[0], &mv(0, 0));
+    ASSERT_EQ(&cv2[0], &mv(0, 1));
+    ASSERT_EQ(&cv3[0], &mv(0, 2));
+
+    ASSERT_EQ(&vr1[0], &mv(0, 0));
+    ASSERT_EQ(&vr2[0], &mv(0, 1));
+    ASSERT_EQ(&vr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&cvr1[0], &mv(0, 0));
+    ASSERT_EQ(&cvr2[0], &mv(0, 1));
+    ASSERT_EQ(&cvr3[0], &mv(0, 2));
+
+    ASSERT_EQ(&mv1(0, 0), &mv(1, 2));
+    ASSERT_EQ(&mv1(1, 1), &mv(2, 3));
+    ASSERT_EQ(&mv1(3, 2), &mv(4, 4));
+    ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2));
+    ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3));
+    ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4));
 
     const_vector_type c_cv1(v1);
     typename vector_type::const_type c_cv2(v2);
diff --git a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
index a5dc6cf29a467bd576bd96bca52f90b3db26324b..d4f484a530c952a33b20dada3222180c7785f06a 100644
--- a/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
+++ b/packages/kokkos/core/unit_test/TestViewAPI_e.hpp
@@ -54,23 +54,24 @@ namespace Test {
 TEST(TEST_CATEGORY, view_remap) {
   enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
 
-#ifdef KOKKOS_ENABLE_CUDA
+#if defined(KOKKOS_ENABLE_CUDA)
 #define EXECSPACE                                                     \
   std::conditional<std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value, \
                    Kokkos::CudaHostPinnedSpace, TEST_EXECSPACE>::type
-#else
-#ifdef KOKKOS_ENABLE_HIP
+#elif defined(KOKKOS_ENABLE_HIP)
 #define EXECSPACE                                                     \
   std::conditional<                                                   \
       std::is_same<TEST_EXECSPACE, Kokkos::Experimental::HIP>::value, \
       Kokkos::Experimental::HIPHostPinnedSpace, TEST_EXECSPACE>::type
-#else
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_SYCL)
+#elif defined(KOKKOS_ENABLE_SYCL)
+#define EXECSPACE                                                      \
+  std::conditional<                                                    \
+      std::is_same<TEST_EXECSPACE, Kokkos::Experimental::SYCL>::value, \
+      Kokkos::Experimental::SYCLHostUSMSpace, TEST_EXECSPACE>::type
+#elif defined(KOKKOS_ENABLE_OPENMPTARGET)
 #define EXECSPACE Kokkos::HostSpace
 #else
 #define EXECSPACE TEST_EXECSPACE
-#endif
-#endif
 #endif
 
   using output_type =
diff --git a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
index e25cb9e39ca6fd4c3cd45ef2b60b404ed82c03e7..ced0aa3828cffabfc196fd9dac146718c0f005d3 100644
--- a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp
@@ -96,10 +96,10 @@ TEST(TEST_CATEGORY, view_copy_tests) {
   auto host = Kokkos::DefaultHostExecutionSpace();
 
   constexpr bool DevExecCanAccessHost =
-      Kokkos::Impl::SpaceAccessibility<typename TEST_EXECSPACE::execution_space,
-                                       Kokkos::HostSpace>::accessible;
+      Kokkos::SpaceAccessibility<typename TEST_EXECSPACE::execution_space,
+                                 Kokkos::HostSpace>::accessible;
 
-  constexpr bool HostExecCanAccessDev = Kokkos::Impl::SpaceAccessibility<
+  constexpr bool HostExecCanAccessDev = Kokkos::SpaceAccessibility<
       typename Kokkos::HostSpace::execution_space,
       typename TEST_EXECSPACE::memory_space>::accessible;
 
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
index fdbda099176c79410c1be6599546f09aba3269dc..974d7c98cafb56c91df55b425913b14c0dfd3ca1 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp
@@ -768,8 +768,8 @@ void test_view_mapping() {
 
     ASSERT_EQ(vr1.extent(0), N);
 
-    if (Kokkos::Impl::SpaceAccessibility<
-            Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+    if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                   typename Space::memory_space>::accessible) {
       for (int i = 0; i < N; ++i) data[i] = i + 1;
       for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1);
       for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1);
@@ -815,8 +815,8 @@ void test_view_mapping() {
 
     ASSERT_EQ(vr1.extent(0), N);
 
-    if (Kokkos::Impl::SpaceAccessibility<
-            Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+    if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                   typename Space::memory_space>::accessible) {
       for (int i = 0; i < N; ++i) vr1(i) = i + 1;
       for (int i = 0; i < N; ++i) ASSERT_EQ(vr1[i], i + 1);
       for (int i = 0; i < N; ++i) ASSERT_EQ(cr1[i], i + 1);
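
These hunks (and the TestViewSubview.hpp ones below) move from Kokkos::Impl::SpaceAccessibility to the public Kokkos::SpaceAccessibility trait. The trait answers, at compile time, whether code running in one execution or memory space can dereference memory living in another memory space. A minimal sketch using the same argument shapes as the tests; the main() driver is illustrative only:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Can host code touch the default execution space's memory directly?
    constexpr bool host_can_access_device_memory =
        Kokkos::SpaceAccessibility<
            Kokkos::HostSpace,
            Kokkos::DefaultExecutionSpace::memory_space>::accessible;

    // Can the default execution space touch host memory directly?
    constexpr bool device_exec_can_access_host_memory =
        Kokkos::SpaceAccessibility<Kokkos::DefaultExecutionSpace,
                                   Kokkos::HostSpace>::accessible;

    std::printf("host->device: %d, device->host: %d\n",
                int(host_can_access_device_memory),
                int(device_exec_can_access_host_memory));
  }
  Kokkos::finalize();
}
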
diff --git a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
index 18db67400d6ea03ecb98e891150cb4a154311982..2a15a84380e7c6d979059a8342c64b9ee68d2eb9 100644
--- a/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewMapping_subview.hpp
@@ -81,7 +81,7 @@ struct TestViewMappingSubview {
   using DLT  = Kokkos::View<int** * [13][14], Kokkos::LayoutLeft, ExecSpace>;
   using DLS1 = Kokkos::Subview<DLT, range, int, int, int, int>;
 
-#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
   static_assert(
       DLS1::rank == 1 &&
           std::is_same<typename DLS1::array_layout, Kokkos::LayoutLeft>::value,
@@ -92,7 +92,7 @@ struct TestViewMappingSubview {
   using DRT  = Kokkos::View<int** * [13][14], Kokkos::LayoutRight, ExecSpace>;
   using DRS1 = Kokkos::Subview<DRT, int, int, int, int, range>;
 
-#if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
   static_assert(
       DRS1::rank == 1 &&
           std::is_same<typename DRS1::array_layout, Kokkos::LayoutRight>::value,
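
The guard change only affects when these static_asserts are compiled; the asserted property itself is that slicing a contiguous range out of the leading (stride-1) dimension of a LayoutLeft view yields a rank-1 subview that is still LayoutLeft, and symmetrically for LayoutRight. That property can be checked in isolation; View2D, Range, and Column below are hypothetical aliases, not part of the test:

#include <Kokkos_Core.hpp>
#include <type_traits>

using View2D = Kokkos::View<double**, Kokkos::LayoutLeft>;
using Range  = Kokkos::pair<int, int>;
// Kokkos::Subview names the type Kokkos::subview() would return for these
// argument types, just like DLS1/DRS1 above.
using Column = Kokkos::Subview<View2D, Range, int>;

static_assert(Column::rank == 1, "one range index -> rank-1 subview");
static_assert(std::is_same<typename Column::array_layout,
                           Kokkos::LayoutLeft>::value,
              "a contiguous slice of the leading dimension stays LayoutLeft");
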
diff --git a/packages/kokkos/core/unit_test/TestViewSubview.hpp b/packages/kokkos/core/unit_test/TestViewSubview.hpp
index 0125017d93786101e2a23a866effe9d8a5e5242d..93eb5476b57be796f5e1fbcb8e9b3db140bb1615 100644
--- a/packages/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/packages/kokkos/core/unit_test/TestViewSubview.hpp
@@ -184,7 +184,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ONE);
+      ASSERT_EQ(X_h(i, j), ONE);
     }
   }
 
@@ -194,7 +194,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ZERO);
+      ASSERT_EQ(X_h(i, j), ZERO);
     }
   }
 
@@ -204,7 +204,7 @@ void test_auto_1d() {
   Kokkos::deep_copy(X_h, X);
   for (size_type j = 0; j < numCols; ++j) {
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == TWO);
+      ASSERT_EQ(X_h(i, j), TWO);
     }
   }
 
@@ -216,7 +216,7 @@ void test_auto_1d() {
     Kokkos::fence();
     Kokkos::deep_copy(X_h, X);
     for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i, j) == ZERO);
+      ASSERT_EQ(X_h(i, j), ZERO);
     }
 
     for (size_type jj = 0; jj < numCols; ++jj) {
@@ -226,7 +226,7 @@ void test_auto_1d() {
       Kokkos::fence();
       Kokkos::deep_copy(X_h, X);
       for (size_type i = 0; i < numRows; ++i) {
-        ASSERT_TRUE(X_h(i, jj) == ONE);
+        ASSERT_EQ(X_h(i, jj), ONE);
       }
     }
   }
@@ -240,38 +240,38 @@ void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n,
   int col = n > 2 ? 2 : 0;
   int row = m > 2 ? 2 : 0;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     if (a) {
       Kokkos::View<double*, LD, Space> l1da =
           Kokkos::subview(l2d, Kokkos::ALL, row);
-      ASSERT_TRUE(&l1da(0) == &l2d(0, row));
+      ASSERT_EQ(&l1da(0), &l2d(0, row));
       if (n > 1) {
-        ASSERT_TRUE(&l1da(1) == &l2d(1, row));
+        ASSERT_EQ(&l1da(1), &l2d(1, row));
       }
     }
 
     if (b && n > 13) {
       Kokkos::View<double*, LD, Space> l1db =
           Kokkos::subview(l2d, std::pair<unsigned, unsigned>(2, 13), row);
-      ASSERT_TRUE(&l1db(0) == &l2d(2, row));
-      ASSERT_TRUE(&l1db(1) == &l2d(3, row));
+      ASSERT_EQ(&l1db(0), &l2d(2, row));
+      ASSERT_EQ(&l1db(1), &l2d(3, row));
     }
 
     if (c) {
       Kokkos::View<double*, LD, Space> l1dc =
           Kokkos::subview(l2d, col, Kokkos::ALL);
-      ASSERT_TRUE(&l1dc(0) == &l2d(col, 0));
+      ASSERT_EQ(&l1dc(0), &l2d(col, 0));
       if (m > 1) {
-        ASSERT_TRUE(&l1dc(1) == &l2d(col, 1));
+        ASSERT_EQ(&l1dc(1), &l2d(col, 1));
       }
     }
 
     if (d && m > 13) {
       Kokkos::View<double*, LD, Space> l1dd =
           Kokkos::subview(l2d, col, std::pair<unsigned, unsigned>(2, 13));
-      ASSERT_TRUE(&l1dd(0) == &l2d(col, 2));
-      ASSERT_TRUE(&l1dd(1) == &l2d(col, 3));
+      ASSERT_EQ(&l1dd(0), &l2d(col, 2));
+      ASSERT_EQ(&l1dd(1), &l2d(col, 3));
     }
   }
 }
@@ -326,8 +326,8 @@ void test_left_0(bool constr) {
   using view_static_8_type =
       Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_static_8_type x_static_8("x_static_left_8");
 
     ASSERT_TRUE(x_static_8.span_is_contiguous());
@@ -337,7 +337,7 @@ void test_left_0(bool constr) {
 
     ASSERT_TRUE(x0.span_is_contiguous());
     ASSERT_EQ(x0.span(), 1);
-    ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1;
     make_subview(constr, x1, x_static_8, Kokkos::pair<int, int>(0, 2), 1, 2, 3,
@@ -345,8 +345,8 @@ void test_left_0(bool constr) {
 
     ASSERT_TRUE(x1.span_is_contiguous());
     ASSERT_EQ(x1.span(), 2);
-    ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x_deg1;
     make_subview(constr, x_deg1, x_static_8, Kokkos::pair<int, int>(0, 0), 1, 2,
@@ -369,10 +369,10 @@ void test_left_0(bool constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 1), &x_static_8(0, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2;
@@ -380,10 +380,10 @@ void test_left_0(bool constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4;
     make_subview(constr, sx4, x_static_8, 0,
@@ -402,9 +402,8 @@ void test_left_0(bool constr) {
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1,
-                                                            1 + i1, 1, 0 + i2,
-                                                            2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x_static_8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -420,8 +419,8 @@ void test_left_1(bool use_constr) {
   using view_type =
       Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x8("x_left_8", 2, 3, 4, 5);
 
     ASSERT_TRUE(x8.span_is_contiguous());
@@ -430,15 +429,15 @@ void test_left_1(bool use_constr) {
     make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &x8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1;
     make_subview(use_constr, x1, x8, Kokkos::pair<int, int>(0, 2), 1, 2, 3, 0,
                  1, 2, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
-    ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x8(1, 1, 2, 3, 0, 1, 2, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1_deg1;
     make_subview(use_constr, x1_deg1, x8, Kokkos::pair<int, int>(0, 0), 1, 2, 3,
@@ -461,10 +460,10 @@ void test_left_1(bool use_constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&x2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 0), &x8(0, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&x2(0, 1), &x8(0, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&x2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_deg2;
     make_subview(use_constr, x2_deg2, x8, Kokkos::pair<int, int>(2, 2), 2, 3, 4,
@@ -477,10 +476,10 @@ void test_left_1(bool use_constr) {
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg;
     make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 0), 2, 3,
@@ -520,8 +519,8 @@ template <class Space>
 void test_left_2() {
   using view_type = Kokkos::View<int****, Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x4("x4", 2, 3, 4, 5);
 
     ASSERT_TRUE(x4.span_is_contiguous());
@@ -530,35 +529,35 @@ void test_left_2() {
         Kokkos::subview(x4, 0, 0, 0, 0);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &x4(0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x4(0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 =
         Kokkos::subview(x4, Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
-    ASSERT_TRUE(&x1(0) == &x4(0, 1, 2, 3));
-    ASSERT_TRUE(&x1(1) == &x4(1, 1, 2, 3));
+    ASSERT_EQ(&x1(0), &x4(0, 1, 2, 3));
+    ASSERT_EQ(&x1(1), &x4(1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 = Kokkos::subview(
         x4, Kokkos::pair<int, int>(0, 2), 1, Kokkos::pair<int, int>(1, 3), 2);
 
     ASSERT_TRUE(!x2.span_is_contiguous());
-    ASSERT_TRUE(&x2(0, 0) == &x4(0, 1, 1, 2));
-    ASSERT_TRUE(&x2(1, 0) == &x4(1, 1, 1, 2));
-    ASSERT_TRUE(&x2(0, 1) == &x4(0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x4(1, 1, 2, 2));
+    ASSERT_EQ(&x2(0, 0), &x4(0, 1, 1, 2));
+    ASSERT_EQ(&x2(1, 0), &x4(1, 1, 1, 2));
+    ASSERT_EQ(&x2(0, 1), &x4(0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x4(1, 1, 2, 2));
 
     // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2 = Kokkos::subview(
         x4, 1, Kokkos::pair<int, int>(0, 2), 2, Kokkos::pair<int, int>(1, 4));
 
     ASSERT_TRUE(!sx2.span_is_contiguous());
-    ASSERT_TRUE(&sx2(0, 0) == &x4(1, 0, 2, 1));
-    ASSERT_TRUE(&sx2(1, 0) == &x4(1, 1, 2, 1));
-    ASSERT_TRUE(&sx2(0, 1) == &x4(1, 0, 2, 2));
-    ASSERT_TRUE(&sx2(1, 1) == &x4(1, 1, 2, 2));
-    ASSERT_TRUE(&sx2(0, 2) == &x4(1, 0, 2, 3));
-    ASSERT_TRUE(&sx2(1, 2) == &x4(1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x4(1, 0, 2, 1));
+    ASSERT_EQ(&sx2(1, 0), &x4(1, 1, 2, 1));
+    ASSERT_EQ(&sx2(0, 1), &x4(1, 0, 2, 2));
+    ASSERT_EQ(&sx2(1, 1), &x4(1, 1, 2, 2));
+    ASSERT_EQ(&sx2(0, 2), &x4(1, 0, 2, 3));
+    ASSERT_EQ(&sx2(1, 2), &x4(1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4 =
         Kokkos::subview(x4, Kokkos::pair<int, int>(1, 2) /* of [2] */
@@ -586,8 +585,8 @@ template <class Space>
 void test_left_3() {
   using view_type = Kokkos::View<int**, Kokkos::LayoutLeft, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type xm("x4", 10, 5);
 
     ASSERT_TRUE(xm.span_is_contiguous());
@@ -595,14 +594,14 @@ void test_left_3() {
     Kokkos::View<int, Kokkos::LayoutLeft, Space> x0 = Kokkos::subview(xm, 5, 3);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &xm(5, 3));
+    ASSERT_EQ(&x0(), &xm(5, 3));
 
     Kokkos::View<int*, Kokkos::LayoutLeft, Space> x1 =
         Kokkos::subview(xm, Kokkos::ALL, 3);
 
     ASSERT_TRUE(x1.span_is_contiguous());
     for (int i = 0; i < int(xm.extent(0)); ++i) {
-      ASSERT_TRUE(&x1(i) == &xm(i, 3));
+      ASSERT_EQ(&x1(i), &xm(i, 3));
     }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2 =
@@ -611,7 +610,7 @@ void test_left_3() {
     ASSERT_TRUE(!x2.span_is_contiguous());
     for (int j = 0; j < int(x2.extent(1)); ++j)
       for (int i = 0; i < int(x2.extent(0)); ++i) {
-        ASSERT_TRUE(&x2(i, j) == &xm(1 + i, j));
+        ASSERT_EQ(&x2(i, j), &xm(1 + i, j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2c =
@@ -620,20 +619,20 @@ void test_left_3() {
     ASSERT_TRUE(x2c.span_is_contiguous());
     for (int j = 0; j < int(x2c.extent(1)); ++j)
       for (int i = 0; i < int(x2c.extent(0)); ++i) {
-        ASSERT_TRUE(&x2c(i, j) == &xm(i, 2 + j));
+        ASSERT_EQ(&x2c(i, j), &xm(i, 2 + j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n1 =
         Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL);
 
-    ASSERT_TRUE(x2_n1.extent(0) == 0);
-    ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1));
+    ASSERT_EQ(x2_n1.extent(0), 0);
+    ASSERT_EQ(x2_n1.extent(1), xm.extent(1));
 
     Kokkos::View<int**, Kokkos::LayoutLeft, Space> x2_n2 =
         Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1));
 
-    ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0));
-    ASSERT_TRUE(x2_n2.extent(1) == 0);
+    ASSERT_EQ(x2_n2.extent(0), xm.extent(0));
+    ASSERT_EQ(x2_n2.extent(1), 0);
   }
 }
 
@@ -644,46 +643,46 @@ void test_right_0(bool use_constr) {
   using view_static_8_type =
       Kokkos::View<int[2][3][4][5][2][3][4][5], Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_static_8_type x_static_8("x_static_right_8");
 
     Kokkos::View<int, Kokkos::LayoutRight, Space> x0;
     make_subview(use_constr, x0, x_static_8, 0, 0, 0, 0, 0, 0, 0, 0);
 
-    ASSERT_TRUE(&x0() == &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x_static_8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1;
     make_subview(use_constr, x1, x_static_8, 0, 1, 2, 3, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(x1.extent(0) == 2);
-    ASSERT_TRUE(&x1(0) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 1));
-    ASSERT_TRUE(&x1(1) == &x_static_8(0, 1, 2, 3, 0, 1, 2, 2));
+    ASSERT_EQ(x1.extent(0), 2);
+    ASSERT_EQ(&x1(0), &x_static_8(0, 1, 2, 3, 0, 1, 2, 1));
+    ASSERT_EQ(&x1(1), &x_static_8(0, 1, 2, 3, 0, 1, 2, 2));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2;
     make_subview(use_constr, x2, x_static_8, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3), 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(x2.extent(0) == 2);
-    ASSERT_TRUE(x2.extent(1) == 2);
-    ASSERT_TRUE(&x2(0, 0) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(1, 0) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(0, 1) == &x_static_8(0, 1, 2, 1, 0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x_static_8(0, 1, 2, 2, 0, 1, 2, 2));
+    ASSERT_EQ(x2.extent(0), 2);
+    ASSERT_EQ(x2.extent(1), 2);
+    ASSERT_EQ(&x2(0, 0), &x_static_8(0, 1, 2, 1, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(1, 0), &x_static_8(0, 1, 2, 2, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(0, 1), &x_static_8(0, 1, 2, 1, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x_static_8(0, 1, 2, 2, 0, 1, 2, 2));
 
     // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2;
     make_subview(use_constr, sx2, x_static_8, 1, Kokkos::pair<int, int>(0, 2),
                  2, 3, Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
-    ASSERT_TRUE(sx2.extent(0) == 2);
-    ASSERT_TRUE(sx2.extent(1) == 2);
-    ASSERT_TRUE(&sx2(0, 0) == &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(sx2.extent(0), 2);
+    ASSERT_EQ(sx2.extent(1), 2);
+    ASSERT_EQ(&sx2(0, 0), &x_static_8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x_static_8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x_static_8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x_static_8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int****, Kokkos::LayoutStride, Space> sx4;
     make_subview(use_constr, sx4, x_static_8, 0,
@@ -696,17 +695,16 @@ void test_right_0(bool use_constr) {
                  2, Kokkos::pair<int, int>(2, 4) /* of [5] */
     );
 
-    ASSERT_TRUE(sx4.extent(0) == 2);
-    ASSERT_TRUE(sx4.extent(1) == 2);
-    ASSERT_TRUE(sx4.extent(2) == 2);
-    ASSERT_TRUE(sx4.extent(3) == 2);
+    ASSERT_EQ(sx4.extent(0), 2);
+    ASSERT_EQ(sx4.extent(1), 2);
+    ASSERT_EQ(sx4.extent(2), 2);
+    ASSERT_EQ(sx4.extent(3), 2);
     for (int i0 = 0; i0 < (int)sx4.extent(0); ++i0)
       for (int i1 = 0; i1 < (int)sx4.extent(1); ++i1)
         for (int i2 = 0; i2 < (int)sx4.extent(2); ++i2)
           for (int i3 = 0; i3 < (int)sx4.extent(3); ++i3) {
-            ASSERT_TRUE(&sx4(i0, i1, i2, i3) == &x_static_8(0, 0 + i0, 1,
-                                                            1 + i1, 1, 0 + i2,
-                                                            2, 2 + i3));
+            ASSERT_EQ(&sx4(i0, i1, i2, i3),
+                      &x_static_8(0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3));
           }
   }
 }
@@ -722,21 +720,21 @@ void test_right_1(bool use_constr) {
   using view_type =
       Kokkos::View<int*** * [2][3][4][5], Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type x8("x_right_8", 2, 3, 4, 5);
 
     Kokkos::View<int, Kokkos::LayoutRight, Space> x0;
     make_subview(use_constr, x0, x8, 0, 0, 0, 0, 0, 0, 0, 0);
 
-    ASSERT_TRUE(&x0() == &x8(0, 0, 0, 0, 0, 0, 0, 0));
+    ASSERT_EQ(&x0(), &x8(0, 0, 0, 0, 0, 0, 0, 0));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1;
     make_subview(use_constr, x1, x8, 0, 1, 2, 3, 0, 1, 2,
                  Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(&x1(0) == &x8(0, 1, 2, 3, 0, 1, 2, 1));
-    ASSERT_TRUE(&x1(1) == &x8(0, 1, 2, 3, 0, 1, 2, 2));
+    ASSERT_EQ(&x1(0), &x8(0, 1, 2, 3, 0, 1, 2, 1));
+    ASSERT_EQ(&x1(1), &x8(0, 1, 2, 3, 0, 1, 2, 2));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1_deg1;
     make_subview(use_constr, x1_deg1, x8, 0, 1, 2, 3, 0, 1, 2,
@@ -747,10 +745,10 @@ void test_right_1(bool use_constr) {
     make_subview(use_constr, x2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3), 0,
                  1, 2, Kokkos::pair<int, int>(1, 3));
 
-    ASSERT_TRUE(&x2(0, 0) == &x8(0, 1, 2, 1, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(1, 0) == &x8(0, 1, 2, 2, 0, 1, 2, 1));
-    ASSERT_TRUE(&x2(0, 1) == &x8(0, 1, 2, 1, 0, 1, 2, 2));
-    ASSERT_TRUE(&x2(1, 1) == &x8(0, 1, 2, 2, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(0, 0), &x8(0, 1, 2, 1, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(1, 0), &x8(0, 1, 2, 2, 0, 1, 2, 1));
+    ASSERT_EQ(&x2(0, 1), &x8(0, 1, 2, 1, 0, 1, 2, 2));
+    ASSERT_EQ(&x2(1, 1), &x8(0, 1, 2, 2, 0, 1, 2, 2));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_deg2;
     make_subview(use_constr, x2_deg2, x8, 0, 1, 2, Kokkos::pair<int, int>(1, 3),
@@ -762,10 +760,10 @@ void test_right_1(bool use_constr) {
     make_subview(use_constr, sx2, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3,
                  Kokkos::pair<int, int>(0, 2), 1, 2, 3);
 
-    ASSERT_TRUE(&sx2(0, 0) == &x8(1, 0, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 0) == &x8(1, 1, 2, 3, 0, 1, 2, 3));
-    ASSERT_TRUE(&sx2(0, 1) == &x8(1, 0, 2, 3, 1, 1, 2, 3));
-    ASSERT_TRUE(&sx2(1, 1) == &x8(1, 1, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 0), &x8(1, 0, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 0), &x8(1, 1, 2, 3, 0, 1, 2, 3));
+    ASSERT_EQ(&sx2(0, 1), &x8(1, 0, 2, 3, 1, 1, 2, 3));
+    ASSERT_EQ(&sx2(1, 1), &x8(1, 1, 2, 3, 1, 1, 2, 3));
 
     Kokkos::View<int**, Kokkos::LayoutStride, Space> sx2_deg;
     make_subview(use_constr, sx2_deg, x8, 1, Kokkos::pair<int, int>(0, 2), 2, 3,
@@ -803,8 +801,8 @@ template <class Space>
 void test_right_3() {
   using view_type = Kokkos::View<int**, Kokkos::LayoutRight, Space>;
 
-  if (Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, typename Space::memory_space>::accessible) {
+  if (Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 typename Space::memory_space>::accessible) {
     view_type xm("x4", 10, 5);
 
     ASSERT_TRUE(xm.span_is_contiguous());
@@ -813,14 +811,14 @@ void test_right_3() {
         Kokkos::subview(xm, 5, 3);
 
     ASSERT_TRUE(x0.span_is_contiguous());
-    ASSERT_TRUE(&x0() == &xm(5, 3));
+    ASSERT_EQ(&x0(), &xm(5, 3));
 
     Kokkos::View<int*, Kokkos::LayoutRight, Space> x1 =
         Kokkos::subview(xm, 3, Kokkos::ALL);
 
     ASSERT_TRUE(x1.span_is_contiguous());
     for (int i = 0; i < int(xm.extent(1)); ++i) {
-      ASSERT_TRUE(&x1(i) == &xm(3, i));
+      ASSERT_EQ(&x1(i), &xm(3, i));
     }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2c =
@@ -829,7 +827,7 @@ void test_right_3() {
     ASSERT_TRUE(x2c.span_is_contiguous());
     for (int j = 0; j < int(x2c.extent(1)); ++j)
       for (int i = 0; i < int(x2c.extent(0)); ++i) {
-        ASSERT_TRUE(&x2c(i, j) == &xm(1 + i, j));
+        ASSERT_EQ(&x2c(i, j), &xm(1 + i, j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2 =
@@ -838,20 +836,20 @@ void test_right_3() {
     ASSERT_TRUE(!x2.span_is_contiguous());
     for (int j = 0; j < int(x2.extent(1)); ++j)
       for (int i = 0; i < int(x2.extent(0)); ++i) {
-        ASSERT_TRUE(&x2(i, j) == &xm(i, 2 + j));
+        ASSERT_EQ(&x2(i, j), &xm(i, 2 + j));
       }
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n1 =
         Kokkos::subview(xm, std::pair<int, int>(1, 1), Kokkos::ALL);
 
-    ASSERT_TRUE(x2_n1.extent(0) == 0);
-    ASSERT_TRUE(x2_n1.extent(1) == xm.extent(1));
+    ASSERT_EQ(x2_n1.extent(0), 0);
+    ASSERT_EQ(x2_n1.extent(1), xm.extent(1));
 
     Kokkos::View<int**, Kokkos::LayoutRight, Space> x2_n2 =
         Kokkos::subview(xm, Kokkos::ALL, std::pair<int, int>(1, 1));
 
-    ASSERT_TRUE(x2_n2.extent(0) == xm.extent(0));
-    ASSERT_TRUE(x2_n2.extent(1) == 0);
+    ASSERT_EQ(x2_n2.extent(0), xm.extent(0));
+    ASSERT_EQ(x2_n2.extent(1), 0);
   }
 }
 
@@ -979,7 +977,7 @@ struct CheckSubviewCorrectness_1D_1D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_1D_1D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1005,7 +1003,7 @@ struct CheckSubviewCorrectness_1D_2D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_1D_2D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1033,7 +1031,7 @@ struct CheckSubviewCorrectness_2D_3D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_2D_3D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1068,7 +1066,7 @@ struct CheckSubviewCorrectness_3D_3D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_3D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1107,7 +1105,7 @@ struct CheckSubviewCorrectness_3D_4D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_4D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1165,7 +1163,7 @@ struct CheckSubviewCorrectness_3D_5D {
     int errors = 0;
     Kokkos::parallel_reduce("CheckSubView_3D_5D", policy_t(0, b.size()), *this,
                             errors);
-    ASSERT_TRUE(errors == 0);
+    ASSERT_EQ(errors, 0);
   }
 
   KOKKOS_INLINE_FUNCTION
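The hunks above convert ASSERT_TRUE(a == b) checks into ASSERT_EQ(a, b), which reports both operand values when a comparison fails instead of just "false". A minimal standalone sketch of the pattern, assuming GoogleTest and a host-accessible view (test and variable names here are illustrative, not part of the patch):

#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>

TEST(subview_sketch, pointer_identity) {
  Kokkos::ScopeGuard guard;  // assumes Kokkos is not already initialized in this sketch
  Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::HostSpace> xm("xm", 10, 5);
  auto x0 = Kokkos::subview(xm, 5, 3);  // rank-0 view aliasing xm(5, 3)
  ASSERT_EQ(&x0(), &xm(5, 3));          // on failure, both addresses are printed
}
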
diff --git a/packages/kokkos/core/unit_test/TestView_64bit.hpp b/packages/kokkos/core/unit_test/TestView_64bit.hpp
index 50626718b5774ddefa03a453402564986e831ed1..174a07ac1d5b6ca6e9ddc145617ea86bf51de314 100644
--- a/packages/kokkos/core/unit_test/TestView_64bit.hpp
+++ b/packages/kokkos/core/unit_test/TestView_64bit.hpp
@@ -49,9 +49,9 @@ namespace Test {
 template <class Device>
 void test_64bit() {
 #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-  // FIXME_SYCL The SYCL CUDA backend throws an error
+  // We are running out of device memory on Intel GPUs
 #ifdef KOKKOS_ENABLE_SYCL
-  int64_t N = 1000000000;
+  int64_t N = 4000000000;
 #else
   int64_t N = 5000000000;
 #endif
@@ -60,7 +60,7 @@ void test_64bit() {
     Kokkos::parallel_reduce(
         Kokkos::RangePolicy<typename Device::execution_space,
                             Kokkos::IndexType<int64_t>>(0, N),
-        KOKKOS_LAMBDA(const int64_t& /*i*/, int64_t& lsum) { lsum += 1; }, sum);
+        KOKKOS_LAMBDA(const int64_t&, int64_t& lsum) { lsum += 1; }, sum);
     ASSERT_EQ(N, sum);
   }
   {
@@ -111,7 +111,12 @@ void test_64bit() {
     ASSERT_EQ(N0 * N1, sum);
   }
   {
-    int N0    = 1024 * 1024 * 1500;
+// We are running out of device memory on Intel GPUs
+#ifdef KOKKOS_ENABLE_SYCL
+    int64_t N0 = 1024 * 1024 * 900;
+#else
+    int N0 = 1024 * 1024 * 1500;
+#endif
     int64_t P = 1713091;
     Kokkos::View<int*, Device> a("A", N0);
     Kokkos::parallel_for(
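The TestView_64bit changes only shrink the problem size on SYCL; the indexing pattern being exercised stays the same. A minimal sketch of that pattern, assuming lambda dispatch is enabled (the helper name is illustrative): Kokkos::IndexType<int64_t> keeps the iteration index from overflowing 32 bits when N exceeds roughly 2.1 billion, and the reduction simply counts iterations.

#include <Kokkos_Core.hpp>

int64_t count_iterations(int64_t N) {
  int64_t sum = 0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace,
                          Kokkos::IndexType<int64_t>>(0, N),
      KOKKOS_LAMBDA(const int64_t, int64_t& lsum) { lsum += 1; }, sum);
  return sum;  // expected to equal N
}
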
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
similarity index 89%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
rename to packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
index 4228b5181a0ccd68dfde87f71f92fd0a471a8e96..0287829fd61e88d19449e7e82d0b9727a5413fb3 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_SharedAlloc.cpp
+++ b/packages/kokkos/core/unit_test/category_files/TestSYCLHostUSM_Category.hpp
@@ -42,5 +42,12 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestSharedAlloc.hpp>
+#ifndef KOKKOS_TEST_SYCL_HOST_USM_SPACE_HPP
+#define KOKKOS_TEST_SYCL_HOST_USM_SPACE_HPP
+
+#include <gtest/gtest.h>
+
+#define TEST_CATEGORY sycl_host_usm
+#define TEST_EXECSPACE Kokkos::Experimental::SYCLHostUSMSpace
+
+#endif
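Category headers like the one added above are consumed by thin translation units that pair the TEST_CATEGORY / TEST_EXECSPACE macros with backend-agnostic test bodies, the same two-include layout visible in the CUDA test files deleted later in this patch. A hedged sketch under that assumption (file and test names below are hypothetical):

// In a hypothetical TestSYCLHostUSM_SomeFeature.cpp:
#include <TestSYCLHostUSM_Category.hpp>  // supplies TEST_CATEGORY / TEST_EXECSPACE
#include <Kokkos_Core.hpp>

TEST(TEST_CATEGORY, sketch_allocation) {
  // TEST_EXECSPACE here names the SYCL host-USM memory space defined above.
  Kokkos::View<int*, TEST_EXECSPACE> v("v", 16);
  ASSERT_EQ(v.extent(0), 16u);
}
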
diff --git a/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp b/packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
similarity index 100%
rename from packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSMSpace_Category.hpp
rename to packages/kokkos/core/unit_test/category_files/TestSYCLSharedUSM_Category.hpp
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
deleted file mode 100644
index bab29610a3d4ad2e812405ba96ed06c7e2dfb3b8..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
deleted file mode 100644
index fd227186d5668239b9d9fe3f6a1ae2b3d5510b32..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
deleted file mode 100644
index 669761df979cfd1458f1d5ea78acfb5738af0d38..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
deleted file mode 100644
index d367fd7e051f49495ce747f6f490bad795f94d86..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
deleted file mode 100644
index 01b284b2f562299b4f23cc197693c2baad40f38e..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
deleted file mode 100644
index e15228b1d772a5dba97ee434e17fdb18188a709a..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
deleted file mode 100644
index 52bbd42f292f4b865def36856913dfc6bbe0028f..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
deleted file mode 100644
index 4aeac8f13f4d28672c671a51c1eacfedbf0e92fd..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
deleted file mode 100644
index e5cb0103424fd022290998307f086aedaea0cb29..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
deleted file mode 100644
index a52fcb833ed2a0e959a25e36195460c1ed914a78..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_c.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_c.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
deleted file mode 100644
index e345cd9667526671ef898a0d1247343b47f6296c..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
deleted file mode 100644
index 61547df4f523969f8c93da8315fddb4467e5ade9..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
deleted file mode 100644
index 75a769bb947485e6e7459c1cb95b7b3b1c26f9b1..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
deleted file mode 100644
index 7d09f5c9f397b3723599aec64c3c50a6aa77a769..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
deleted file mode 100644
index ea03f43bd69a318095e6277f4db226241fc9a482..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
deleted file mode 100644
index 1f754e8f4996cbc3c0fbefd7000bff65451b19f0..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
deleted file mode 100644
index 4af7057d2aa47db99a8325159e0ee737feff7767..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestCudaUVM_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
index ee7181e1180fdb887a87190605565e42e897409c..d09d4edfdad12e7db332c279398247bfda9ca80a 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
@@ -60,7 +60,7 @@ __global__ void offset(int* p) {
 // Cuda.
 TEST(cuda, raw_cuda_interop) {
   int* p;
-  CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
 
@@ -70,11 +70,11 @@ TEST(cuda, raw_cuda_interop) {
   Kokkos::finalize();
 
   offset<<<100, 64>>>(p);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
 
   std::array<int, 100> h_p;
   cudaMemcpy(h_p.data(), p, sizeof(int) * 100, cudaMemcpyDefault);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
@@ -83,6 +83,6 @@ TEST(cuda, raw_cuda_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
-  CUDA_SAFE_CALL(cudaFree(p));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p));
 }
 }  // namespace Test
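The hunks above rename the error-checking wrapper to KOKKOS_IMPL_CUDA_SAFE_CALL. As a hedged illustration only, and not Kokkos's actual definition, a CUDA "safe call" macro of this kind generally evaluates the API call once, checks the returned cudaError_t, and aborts with file/line context on failure:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative stand-in, not the Kokkos macro:
#define DEMO_CUDA_SAFE_CALL(call)                                     \
  do {                                                                \
    cudaError_t demo_err = (call);                                    \
    if (demo_err != cudaSuccess) {                                    \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                   cudaGetErrorString(demo_err), __FILE__, __LINE__); \
      std::abort();                                                   \
    }                                                                 \
  } while (0)

// Usage mirrors the calls in the test above:
//   DEMO_CUDA_SAFE_CALL(cudaDeviceSynchronize());
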
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
index 526b985c00f2eec2eab6cafb8e862eff5024d575..13388b4c5472c5441d33e9fbfb8f99a995bdcdf0 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
@@ -99,12 +99,12 @@ TEST(cuda, raw_cuda_streams) {
   }
   Kokkos::finalize();
   offset_streams<<<100, 64, 0, stream>>>(p);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   cudaStreamDestroy(stream);
 
   int h_p[100];
   cudaMemcpy(h_p, p, sizeof(int) * 100, cudaMemcpyDefault);
-  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
index 646b37908654d2af6327158cb49f7d4257e8f8bf..2fa61d43120d338bac3c475fc7cf35e9aeb06776 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -181,37 +181,33 @@ TEST(cuda, space_access) {
   //--------------------------------------
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                        Kokkos::HostSpace>::accessible,
+      !Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::HostSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                                 Kokkos::CudaSpace>::accessible,
-                "");
-
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                       Kokkos::CudaUVMSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::Cuda, Kokkos::CudaSpace>::accessible,
       "");
 
-  static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::Cuda,
-                                       Kokkos::CudaHostPinnedSpace>::accessible,
-      "");
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::Cuda,
+                                           Kokkos::CudaUVMSpace>::accessible,
+                "");
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                        Kokkos::CudaSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::Cuda,
+                                 Kokkos::CudaHostPinnedSpace>::accessible,
       "");
 
-  static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                       Kokkos::CudaUVMSpace>::accessible,
-      "");
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                            Kokkos::CudaSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                           Kokkos::CudaUVMSpace>::accessible,
+                "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,
-                                       Kokkos::CudaHostPinnedSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                 Kokkos::CudaHostPinnedSpace>::accessible,
       "");
 
   static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
@@ -235,23 +231,23 @@ TEST(cuda, space_access) {
                                             Kokkos::CudaUVMSpace>>::value,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
-                    Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space,
-                    Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<Kokkos::Impl::HostMirror<Kokkos::Cuda>::Space,
+                                 Kokkos::HostSpace>::accessible,
+      "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<
+      Kokkos::SpaceAccessibility<
           Kokkos::Impl::HostMirror<Kokkos::CudaHostPinnedSpace>::Space,
           Kokkos::HostSpace>::accessible,
       "");
@@ -265,8 +261,8 @@ TEST(cuda, space_access) {
 
 TEST(cuda, uvm) {
   if (Kokkos::CudaUVMSpace::available()) {
-    int *uvm_ptr = (int *)Kokkos::kokkos_malloc<Kokkos::CudaUVMSpace>(
-        "uvm_ptr", sizeof(int));
+    int *uvm_ptr = static_cast<int *>(
+        Kokkos::kokkos_malloc<Kokkos::CudaUVMSpace>("uvm_ptr", sizeof(int)));
 
     *uvm_ptr = 42;
 
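The static_assert rewrites above switch from the internal Kokkos::Impl::SpaceAccessibility to the public Kokkos::SpaceAccessibility trait without changing what is being asserted. A minimal sketch of the public trait, assuming a CUDA-enabled build for the second check: SpaceAccessibility<AccessSpace, MemorySpace>::accessible is a compile-time bool stating whether AccessSpace can dereference allocations made in MemorySpace.

#include <Kokkos_Core.hpp>

static_assert(
    Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                               Kokkos::HostSpace>::accessible,
    "host code can always touch HostSpace allocations");

#ifdef KOKKOS_ENABLE_CUDA
static_assert(
    !Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                Kokkos::CudaSpace>::accessible,
    "plain CudaSpace memory is not host-accessible");
#endif
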
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
index 5dcbe566e299c0f013843216b0854dc51582dd6d..6d6ff0a67bc151421556fca487f30677a5119c33 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType.cpp
@@ -59,17 +59,17 @@ TEST(TEST_CATEGORY, host_space_access) {
   using mirror_space =
       Kokkos::Impl::HostMirror<Kokkos::DefaultExecutionSpace>::Space;
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<host_exec_space,
-                                                 Kokkos::HostSpace>::accessible,
+  static_assert(Kokkos::SpaceAccessibility<host_exec_space,
+                                           Kokkos::HostSpace>::accessible,
                 "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<device_space,
-                                                 Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<device_space, Kokkos::HostSpace>::accessible,
+      "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<mirror_space,
-                                                 Kokkos::HostSpace>::accessible,
-                "");
+  static_assert(
+      Kokkos::SpaceAccessibility<mirror_space, Kokkos::HostSpace>::accessible,
+      "");
 }
 
 }  // namespace Test
diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
index bcd49e69bd3af022ede0ca0a188066288c9b1d35..c74090fff93c8b9a529fad2e5d156d4cad55b954 100644
--- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
+++ b/packages/kokkos/core/unit_test/default/TestDefaultDeviceType_d.cpp
@@ -54,12 +54,13 @@
 namespace Test {
 
 TEST(defaultdevicetype, malloc) {
-  int* data = (int*)Kokkos::kokkos_malloc(100 * sizeof(int));
-  ASSERT_NO_THROW(data = (int*)Kokkos::kokkos_realloc(data, 120 * sizeof(int)));
+  int* data = static_cast<int*>(Kokkos::kokkos_malloc(100 * sizeof(int)));
+  ASSERT_NO_THROW(data = static_cast<int*>(
+                      Kokkos::kokkos_realloc(data, 120 * sizeof(int))));
   Kokkos::kokkos_free(data);
 
-  int* data2 = (int*)Kokkos::kokkos_malloc(0);
-  ASSERT_TRUE(data2 == nullptr);
+  int* data2 = static_cast<int*>(Kokkos::kokkos_malloc(0));
+  ASSERT_EQ(data2, nullptr);
   Kokkos::kokkos_free(data2);
 }
 
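The malloc test above keeps the same allocation round trip but replaces C-style casts with static_cast, since kokkos_malloc and kokkos_realloc return void*. A minimal sketch of that round trip, assuming Kokkos has been initialized (the function name is illustrative):

#include <Kokkos_Core.hpp>

void malloc_roundtrip() {
  // Allocates in the default memory space; kokkos_malloc(0) is expected to yield nullptr.
  int* data = static_cast<int*>(Kokkos::kokkos_malloc(100 * sizeof(int)));
  data      = static_cast<int*>(Kokkos::kokkos_realloc(data, 120 * sizeof(int)));
  Kokkos::kokkos_free(data);
}
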
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
deleted file mode 100644
index 02157836b3f6075c6c18e2919d93ed4b541dbab8..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
deleted file mode 100644
index 80e2fe3f93716c23979ede23aa81de9b2f694c9e..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
deleted file mode 100644
index 9694e33ca0ce0f5c2fc6214613f4ae2f03c9750d..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_c.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_c.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
deleted file mode 100644
index 0d773494ac6236ce0274cc844fb3369aec81d51d..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_d.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_d.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
deleted file mode 100644
index cbbbc810b0e8e588be2892b83279a4137675de66..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewAPI_e.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewAPI_e.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
deleted file mode 100644
index 444a3e6e95d2a62c1ad0e8bedba3767503dd4687..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewCopy_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
deleted file mode 100644
index f1f90e7acf13c7aaa4820f5bd50ecc403f2d6f5f..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewCopy_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewCopy_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
deleted file mode 100644
index 5e83121e341db1da440c65cd5dce84dc1a6f6259..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_a.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_a.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
deleted file mode 100644
index c024143d6c7b735dfa3b897e0a4503ee50e4caec..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_b.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_b.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp b/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
deleted file mode 100644
index dcd6c1dc435982fdf44950c3b606847c29c30b37..0000000000000000000000000000000000000000
--- a/packages/kokkos/core/unit_test/hip/TestHIPHostPinned_ViewMapping_subview.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 3.0
-//       Copyright (2020) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <TestHIPHostPinned_Category.hpp>
-#include <TestViewMapping_subview.hpp>
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
index 0a243e0e8e89c0ef5a7cec6195837909d092bc2a..854f916ba3dad7777f453694ea708a0754872d3d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_AsyncLauncher.cpp
@@ -66,8 +66,8 @@ struct TestAsyncLauncher {
 
 TEST(hip, async_launcher) {
   size_t *flag;
-  HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t)));
-  HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t)));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&flag, sizeof(size_t)));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(flag, 0, sizeof(size_t)));
   // launch # of cycles * 1000 kernels w/ distinct values
   auto space        = Kokkos::Experimental::HIP();
   auto instance     = space.impl_internal_space_instance();
@@ -80,10 +80,10 @@ TEST(hip, async_launcher) {
   // the sum below should fail
   instance->fence();
   size_t h_flag;
-  HIP_SAFE_CALL(
-      hipMemcpy(&h_flag, flag, sizeof(size_t), hipMemcpyHostToDevice));
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipMemcpy(&h_flag, flag, sizeof(size_t), hipMemcpyDeviceToHost));
   ASSERT_EQ(h_flag, (nkernels * (nkernels - 1)) / 2);
-  HIP_SAFE_CALL(hipFree(flag));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(flag));
 }
 
 }  // namespace Test
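The renames in this file switch the test to the backend-internal KOKKOS_IMPL_HIP_SAFE_CALL macro. As an illustration only (the real macro is defined inside the Kokkos HIP backend and may differ, for instance in how it reports and aborts), a generic HIP error-check wrapper looks like this:

    #include <hip/hip_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Check a HIP runtime call and abort with a diagnostic on failure.
    #define HIP_CHECK(call)                                            \
      do {                                                             \
        hipError_t hip_check_err = (call);                             \
        if (hip_check_err != hipSuccess) {                             \
          std::fprintf(stderr, "HIP error '%s' at %s:%d\n",            \
                       hipGetErrorString(hip_check_err), __FILE__,     \
                       __LINE__);                                      \
          std::abort();                                                \
        }                                                              \
      } while (0)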
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f382e5b568b85cdac1f4943c72aa02be73472d7a
--- /dev/null
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_BlocksizeDeduction.cpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestHIP_Category.hpp>
+
+namespace Test {
+
+struct TestNone {
+  Kokkos::View<size_t*, TEST_EXECSPACE> view;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const { view(i) = i; }
+
+  TestNone() { view = Kokkos::View<size_t*, TEST_EXECSPACE>("dummy", 1); }
+};
+
+struct TestSpiller {
+  Kokkos::View<size_t*, TEST_EXECSPACE> view;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    size_t array[1000] = {0};
+    // fill and combine a large local array so it stays live (forces spills)
+    size_t value = 0;
+    for (int ii = i; ii < 1000; ++ii) {
+      array[ii] = value;
+      value += ii;
+    }
+    for (int ii = i; ii < 1000; ++ii) {
+      value *= array[ii];
+    }
+    Kokkos::atomic_add(&view[0], value);
+  }
+
+  TestSpiller() { view = Kokkos::View<size_t*, TEST_EXECSPACE>("dummy", 1); }
+};
+
+TEST(hip, preferred_blocksize_deduction) {
+  using execution_space =
+      typename Kokkos::Impl::FunctorPolicyExecutionSpace<TestSpiller,
+                                                         void>::execution_space;
+  using policy = Kokkos::RangePolicy<execution_space>;
+
+  {
+    using DriverType = Kokkos::Impl::ParallelFor<TestNone, policy>;
+    ASSERT_TRUE(Kokkos::Experimental::Impl::HIPParallelLaunch<
+                    DriverType>::get_scratch_size() == 0);
+  }
+
+  {
+    using DriverType = Kokkos::Impl::ParallelFor<TestSpiller, policy>;
+    ASSERT_TRUE(Kokkos::Experimental::Impl::HIPParallelLaunch<
+                    DriverType>::get_scratch_size() > 0);
+  }
+}
+
+}  // namespace Test
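The new test queries the launch implementation directly; in ordinary use the two functors above would simply be dispatched with parallel_for, and TestSpiller's large local array exists to force register spills so that per-thread scratch must be reserved. A hedged usage sketch, assuming TEST_EXECSPACE is Kokkos::Experimental::HIP:

    Kokkos::parallel_for("no_spills",
                         Kokkos::RangePolicy<Kokkos::Experimental::HIP>(0, 1),
                         TestNone());
    Kokkos::parallel_for("spills",
                         Kokkos::RangePolicy<Kokkos::Experimental::HIP>(0, 1),
                         TestSpiller());
    Kokkos::fence();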
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
index 3a76ca148cf683a83b84d351e4ebd8b2f7cdec94..73d08abca9d396464e8ba538e6e228c4ad70628b 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
@@ -60,7 +60,7 @@ __global__ void offset(int* p) {
 // HIP.
 TEST(hip, raw_hip_interop) {
   int* p;
-  HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
 
@@ -70,11 +70,12 @@ TEST(hip, raw_hip_interop) {
   Kokkos::finalize();
 
   offset<<<dim3(100), dim3(100), 0, nullptr>>>(p);
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
 
   std::array<int, 100> h_p;
-  HIP_SAFE_CALL(hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault));
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipMemcpy(h_p.data(), p, sizeof(int) * 100, hipMemcpyDefault));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
@@ -83,6 +84,6 @@ TEST(hip, raw_hip_interop) {
   }
 
   ASSERT_EQ(sum, sum_expect);
-  HIP_SAFE_CALL(hipFree(p));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(p));
 }
 }  // namespace Test
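A sketch of the interop pattern this test exercises, assuming HIP provides the default device backend: memory obtained from raw hipMalloc outlives Kokkos::initialize/finalize and can be wrapped in an unmanaged View while Kokkos is alive.

    void raw_hip_interop_sketch() {
      int* p = nullptr;
      if (hipMalloc(&p, sizeof(int) * 100) != hipSuccess) return;

      Kokkos::initialize();
      {
        // wrap the raw device pointer without taking ownership
        Kokkos::View<int*, Kokkos::Experimental::HIPSpace,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>
            v(p, 100);
        Kokkos::deep_copy(v, 0);
      }
      Kokkos::finalize();

      (void)hipFree(p);  // the allocation is still valid after finalize
    }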
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
index 8e0880ddbd0b15524be75ab97b90044e5315a8ff..69ca62df6a3a3e95cc77fb4354b96eb6a16e0c2d 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
@@ -51,11 +51,11 @@ namespace Test {
 // bound in HIP due to an error when computing the block size.
 TEST(hip, raw_hip_streams) {
   hipStream_t stream;
-  HIP_SAFE_CALL(hipStreamCreate(&stream));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
   Kokkos::InitArguments arguments{-1, -1, -1, false};
   Kokkos::initialize(arguments);
   int* p;
-  HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
 
   {
@@ -97,12 +97,13 @@ TEST(hip, raw_hip_streams) {
   }
   Kokkos::finalize();
   offset_streams<<<100, 64, 0, stream>>>(p);
-  HIP_SAFE_CALL(hipDeviceSynchronize());
-  HIP_SAFE_CALL(hipStreamDestroy(stream));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream));
 
   int h_p[100];
-  HIP_SAFE_CALL(hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault));
-  HIP_SAFE_CALL(hipDeviceSynchronize());
+  KOKKOS_IMPL_HIP_SAFE_CALL(
+      hipMemcpy(h_p, p, sizeof(int) * 100, hipMemcpyDefault));
+  KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize());
   int64_t sum        = 0;
   int64_t sum_expect = 0;
   for (int i = 0; i < 100; i++) {
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
index ae1de8ea2d304e41d672ff2e136d16c86cbb8068..d20ea877ec9e1f4aee9f0df5c1d807790cdc932e 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp
@@ -129,27 +129,26 @@ TEST(hip, space_access) {
 
   //--------------------------------------
 
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::HIP,
+                                            Kokkos::HostSpace>::accessible,
+                "");
+
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<Kokkos::Experimental::HIP,
-                                        Kokkos::HostSpace>::accessible,
+      Kokkos::SpaceAccessibility<Kokkos::Experimental::HIP,
+                                 Kokkos::Experimental::HIPSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
-                    Kokkos::Experimental::HIP,
-                    Kokkos::Experimental::HIPSpace>::accessible,
-                "");
-
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Experimental::HIP,
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
 
   static_assert(
-      !Kokkos::Impl::SpaceAccessibility<
-          Kokkos::HostSpace, Kokkos::Experimental::HIPSpace>::accessible,
+      !Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                  Kokkos::Experimental::HIPSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::HostSpace,
                     Kokkos::Experimental::HIPHostPinnedSpace>::accessible,
                 "");
@@ -166,18 +165,18 @@ TEST(hip, space_access) {
                    Kokkos::Experimental::HIPHostPinnedSpace>::value,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<Kokkos::Experimental::HIP>::Space,
                     Kokkos::HostSpace>::accessible,
                 "");
 
   static_assert(
-      Kokkos::Impl::SpaceAccessibility<
+      Kokkos::SpaceAccessibility<
           Kokkos::Impl::HostMirror<Kokkos::Experimental::HIPSpace>::Space,
           Kokkos::HostSpace>::accessible,
       "");
 
-  static_assert(Kokkos::Impl::SpaceAccessibility<
+  static_assert(Kokkos::SpaceAccessibility<
                     Kokkos::Impl::HostMirror<
                         Kokkos::Experimental::HIPHostPinnedSpace>::Space,
                     Kokkos::HostSpace>::accessible,
diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
index db360a99d3d60977cf06479e7662e21350dd5f99..86b2fab3c7e3fc0b53b309646add9f4817b804f2 100644
--- a/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
+++ b/packages/kokkos/core/unit_test/hip/TestHIP_TeamScratchStreams.cpp
@@ -104,7 +104,7 @@ void hip_stream_scratch_test(
   hipStream_t stream[4];
   Kokkos::Experimental::HIP hip[4];
   for (int i = 0; i < K; i++) {
-    HIP_SAFE_CALL(hipStreamCreate(&stream[i]));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream[i]));
     hip[i] = Kokkos::Experimental::HIP(stream[i]);
   }
 // Test that growing scratch size in subsequent calls doesn't crash things
@@ -131,7 +131,7 @@ void hip_stream_scratch_test(
   Kokkos::fence();
   for (int i = 0; i < K; i++) {
     hip[i] = Kokkos::Experimental::HIP();
-    HIP_SAFE_CALL(hipStreamDestroy(stream[i]));
+    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream[i]));
   }
 }
 }  // namespace Impl
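For reference, a hedged sketch of the stream-interop idiom this test builds on: a raw hipStream_t is wrapped in a Kokkos::Experimental::HIP instance, so kernels launched through a policy constructed with that instance are enqueued on the caller's stream.

    hipStream_t stream;
    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
    {
      Kokkos::Experimental::HIP hip_instance(stream);
      Kokkos::parallel_for(
          "on_stream",
          Kokkos::RangePolicy<Kokkos::Experimental::HIP>(hip_instance, 0, 128),
          KOKKOS_LAMBDA(int) { /* device work */ });
      hip_instance.fence();
    }
    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream));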
diff --git a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
index 419486d7a84673dacd48e7bf2513e054106bab4c..4d5ca46ba6ee6a41c8a9461bc5c26b36e5996a55 100644
--- a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp
@@ -88,7 +88,7 @@ struct TestIncrExecSpace {
     ExecSpace().fence();
 
     auto concurrency = ExecSpace().concurrency();
-    ASSERT_TRUE(concurrency > 0);
+    ASSERT_GT(concurrency, 0);
 
     int in_parallel = ExecSpace::in_parallel();
     ASSERT_FALSE(in_parallel);
@@ -107,5 +107,7 @@ TEST(TEST_CATEGORY, IncrTest_01_execspace) {
   ASSERT_TRUE(Kokkos::is_execution_space<TEST_EXECSPACE>::value);
   ASSERT_FALSE(Kokkos::is_execution_space<
                TestIncrExecSpaceTypedef<TEST_EXECSPACE>>::value);
+  TestIncrExecSpace<TEST_EXECSPACE> test;
+  test.testit();
 }
 }  // namespace Test
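For completeness, the trait asserted above can be applied to any type; a two-line illustration:

    static_assert(Kokkos::is_execution_space<Kokkos::DefaultExecutionSpace>::value,
                  "every build provides a default execution space");
    static_assert(!Kokkos::is_execution_space<int>::value,
                  "plain types are not execution spaces");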
diff --git a/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp b/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
index ff4fb6a89f4d380d0693e8697e27fbf5bde2f4d0..d40cb4dbe7f77627429f9880ce4981b38f414c9d 100644
--- a/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test02_atomic_host.hpp
@@ -78,7 +78,7 @@ struct TestIncrAtomic {
   }
 };
 
-TEST(TEST_CATEGORY, IncrTest_01_AtomicExchange) {
+TEST(TEST_CATEGORY, IncrTest_02_AtomicExchange) {
   TestIncrAtomic test;
   test.testExchange();
 }
diff --git a/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp b/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
index 4adf9e058fd5b1a85b3f7e24cac530876b7251f3..4192d4abe865f10a43e9a87ed6ee4aa877974dc0 100644
--- a/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test06_ParallelFor_MDRangePolicy.hpp
@@ -94,16 +94,16 @@ struct TestMDRangePolicy {
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   // An MDRangePolicy for 3 nested loops
-  using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<3>, int_index>;
+  using MDPolicyType_3D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, int_index>;
 
   // An MDRangePolicy for 4 nested loops
-  using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<4>, int_index>;
+  using MDPolicyType_4D =
+      typename Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, int_index>;
 
   // Device and Host Data structure pointer
   value_type *deviceData, *hostData;
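This change adopts the non-Experimental spellings: MDRangePolicy and Rank now live directly in namespace Kokkos. A short usage sketch with the default execution space:

    using Policy2D = Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<int>>;
    Kokkos::View<double**> a("a", 10, 10);
    Kokkos::parallel_for(
        "init2D", Policy2D({0, 0}, {10, 10}),
        KOKKOS_LAMBDA(const int i, const int j) { a(i, j) = 10.0 * i + j; });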
diff --git a/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp b/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
index 5166f5a9f0de05b24166161654c9eaab4ff2ad82..6e8fc07b8de50e61e1139ad0b7ba6e2752e81229 100644
--- a/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test08_deep_copy.hpp
@@ -61,17 +61,17 @@ const int M      = 10;
 template <class ExecSpace>
 struct TestMDRangePolicy {
   // 2D View
-  using View_2D      = typename Kokkos::View<value_type **, ExecSpace>;
+  using View_2D      = Kokkos::View<value_type **, ExecSpace>;
   using Host_View_2D = typename View_2D::HostMirror;
   Host_View_2D hostDataView_2D;
 
   // 3D View
-  using View_3D      = typename Kokkos::View<value_type ***, ExecSpace>;
+  using View_3D      = Kokkos::View<value_type ***, ExecSpace>;
   using Host_View_3D = typename View_3D::HostMirror;
   Host_View_3D hostDataView_3D;
 
   // 4D View
-  using View_4D      = typename Kokkos::View<value_type ****, ExecSpace>;
+  using View_4D      = Kokkos::View<value_type ****, ExecSpace>;
   using Host_View_4D = typename View_4D::HostMirror;
   Host_View_4D hostDataView_4D;
 
@@ -83,16 +83,16 @@ struct TestMDRangePolicy {
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   // An MDRangePolicy for 3 nested loops
-  using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<3>, int_index>;
+  using MDPolicyType_3D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<3>, int_index>;
 
   // An MDRangePolicy for 4 nested loops
-  using MDPolicyType_4D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<4>, int_index>;
+  using MDPolicyType_4D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<4>, int_index>;
 
   // compare and equal
   void compare_equal_2D() {
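Since this file revolves around HostMirror views and deep copies, a minimal sketch of that round trip (not part of the patch):

    Kokkos::View<double**> d_view("device", 10, 10);
    auto h_view = Kokkos::create_mirror_view(d_view);  // host-accessible mirror
    Kokkos::deep_copy(h_view, d_view);                 // device -> host
    h_view(0, 0) = 42.0;
    Kokkos::deep_copy(d_view, h_view);                 // host -> device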
diff --git a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
index 5bf1860d8e4a6bcf739656bdc7e1f790ebf60512..ab1cd90d4bfa2e5f5800b8dd13f23a999d2a8fa0 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12a_ThreadScratch.hpp
@@ -74,9 +74,15 @@ struct ThreadScratch {
     for (int i = 0; i < sY; ++i) v_S(i) = 0;
 
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, sX), [&](const int m) {
+    // FIXME_SYCL This deadlocks in the subgroup_barrier when running on CUDA
+    // devices.
+#ifdef KOKKOS_ENABLE_SYCL
+      for (int k = 0; k < sY; ++k) v_S(k) += sX * sY * n + sY * m + k;
+#else
       Kokkos::parallel_for(
           Kokkos::ThreadVectorRange(team, sY),
           [&](const int k) { v_S(k) += sX * sY * n + sY * m + k; });
+#endif
     });
 
     team.team_barrier();
@@ -93,7 +99,7 @@ struct ThreadScratch {
     int scratchSize = scratch_t::shmem_size(sY);
     // So this works with deprecated code enabled:
     policy_t policy =
-        policy_t(pN, Kokkos::AUTO)
+        policy_t(pN, Kokkos::AUTO, 1)
             .set_scratch_size(scratch_level, Kokkos::PerThread(scratchSize));
 
     int max_team_size = policy.team_size_max(*this, Kokkos::ParallelForTag());
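A hedged sketch of the per-thread scratch setup this hunk configures; the level, sizes, and element type below are illustrative assumptions rather than the test's actual values:

    using policy_t  = Kokkos::TeamPolicy<>;
    using member_t  = policy_t::member_type;
    using scratch_t =
        Kokkos::View<int*, Kokkos::DefaultExecutionSpace::scratch_memory_space,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

    const int sY               = 16;
    const size_t scratch_bytes = scratch_t::shmem_size(sY);
    // league of 32 teams, automatic team size, vector length 1,
    // level-0 scratch reserved per thread
    policy_t policy = policy_t(32, Kokkos::AUTO, 1)
                          .set_scratch_size(0, Kokkos::PerThread(scratch_bytes));

    Kokkos::parallel_for(
        "thread_scratch", policy, KOKKOS_LAMBDA(const member_t& team) {
          scratch_t s(team.thread_scratch(0), sY);
          for (int k = 0; k < sY; ++k) s(k) = k;
        });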
diff --git a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
index b34f652e76d919f14c3afed0656b8bcd86dbc27f..d81822d0da9cfd67b25c8394886509407ecbfeb0 100644
--- a/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test12b_TeamScratch.hpp
@@ -68,7 +68,7 @@ struct TeamScratch {
 
     Kokkos::parallel_for(
         "Team",
-        policy_t(pN, Kokkos::AUTO)
+        policy_t(pN, Kokkos::AUTO, 1)
             .set_scratch_size(scratch_level, Kokkos::PerTeam(scratchSize)),
         KOKKOS_LAMBDA(const team_t &team) {
           // Allocate and use scratch pad memory
@@ -77,11 +77,20 @@ struct TeamScratch {
 
           Kokkos::parallel_for(
               Kokkos::TeamThreadRange(team, sX), [&](const int m) {
+      // FIXME_SYCL This deadlocks in the subgroup_barrier
+      // when running on CUDA devices.
+#ifdef KOKKOS_ENABLE_SYCL
+                for (int k = 0; k < sY; ++k) {
+                  v_S(m, k) =
+                      v_S.extent(0) * v_S.extent(1) * n + v_S.extent(1) * m + k;
+                }
+#else
                 Kokkos::parallel_for(
                     Kokkos::ThreadVectorRange(team, sY), [&](const int k) {
                       v_S(m, k) = v_S.extent(0) * v_S.extent(1) * n +
                                   v_S.extent(1) * m + k;
                     });
+#endif
               });
 
           team.team_barrier();
diff --git a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
index d227e834dc64607c4ca01127228527dc71e9e918..7d53b9fb208ea490626634c304dd130b74392461 100644
--- a/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
+++ b/packages/kokkos/core/unit_test/incremental/Test14_MDRangeReduce.hpp
@@ -82,20 +82,20 @@ struct MyComplex {
 template <class ExecSpace>
 struct TestMDRangeReduce {
   // 1D  View of double
-  using View_1D = typename Kokkos::View<value_type*, ExecSpace>;
+  using View_1D = Kokkos::View<value_type*, ExecSpace>;
 
   // 2D  View of double
-  using View_2D = typename Kokkos::View<value_type**, ExecSpace>;
+  using View_2D = Kokkos::View<value_type**, ExecSpace>;
 
   // Index Type for the iterator
   using int_index = Kokkos::IndexType<int>;
 
   // An MDRangePolicy for 2 nested loops
-  using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-      ExecSpace, Kokkos::Experimental::Rank<2>, int_index>;
+  using MDPolicyType_2D =
+      Kokkos::MDRangePolicy<ExecSpace, Kokkos::Rank<2>, int_index>;
 
   //  1D - complex View
-  using Complex_View_1D = typename Kokkos::View<MyComplex*, ExecSpace>;
+  using Complex_View_1D = Kokkos::View<MyComplex*, ExecSpace>;
 
   // Reduction when ExecPolicy = MDRangePolicy and ReducerArgument =
   // scalar/1-element view
@@ -176,7 +176,11 @@ struct TestMDRangeReduce {
 TEST(TEST_CATEGORY, incr_14_MDrangeReduce) {
   TestMDRangeReduce<TEST_EXECSPACE> test;
   test.reduce_MDRange();
+// FIXME_OPENMPTARGET: custom reductions are not yet supported in the
+// OpenMPTarget backend.
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
   test.reduce_custom();
+#endif
 }
 
 }  // namespace Test
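For reference, the public pattern behind reduce_MDRange() is an MDRangePolicy parallel_reduce into a scalar; reduce_custom() additionally reduces a user-defined type, which the guard above skips on OpenMPTarget. A minimal sketch of the former:

    double sum = 0.0;
    Kokkos::parallel_reduce(
        "sum2D", Kokkos::MDRangePolicy<Kokkos::Rank<2>>({0, 0}, {10, 10}),
        KOKKOS_LAMBDA(const int i, const int j, double& lsum) {
          lsum += 10.0 * i + j;
        },
        sum);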
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
index 018855963d35f8fef81a93985811dcc3d9b239fc..d145d69d9e0feb4450bfff5080e9955115b5c49e 100644
--- a/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
@@ -52,13 +52,16 @@ namespace Test {
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
 // SYCL.
 TEST(sycl, raw_sycl_interop) {
+  Kokkos::InitArguments arguments{-1, -1, -1, false};
+  Kokkos::initialize(arguments);
+
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
   sycl::default_selector device_selector;
-  sycl::queue queue(device_selector);
+  sycl::queue queue(default_context, device_selector);
   constexpr int n = 100;
   int* p          = sycl::malloc_device<int>(n, queue);
-
-  Kokkos::InitArguments arguments{-1, -1, -1, false};
-  Kokkos::initialize(arguments);
   {
     TEST_EXECSPACE space(queue);
     Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, n);
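The reordering above lets the raw queue be constructed from the context of Kokkos's default SYCL instance; USM allocations are tied to a context, so only then can pointers be exchanged freely between the raw queue and Kokkos. A hedged sketch of the resulting pattern:

    Kokkos::initialize();
    {
      Kokkos::Experimental::SYCL space;
      sycl::queue queue(space.sycl_context(), sycl::default_selector{});

      int* p = sycl::malloc_device<int>(100, queue);
      Kokkos::View<int*, Kokkos::Experimental::SYCLDeviceUSMSpace,
                   Kokkos::MemoryTraits<Kokkos::Unmanaged>>
          v(p, 100);
      Kokkos::deep_copy(v, 0);
      sycl::free(p, queue);
    }
    Kokkos::finalize();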
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91fdaac6e097fb9b127816301ca2e5c514a4f374
--- /dev/null
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp
@@ -0,0 +1,356 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestSYCL_Category.hpp>
+
+namespace Test {
+
+TEST(sycl, space_access) {
+  static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
+                                                Kokkos::HostSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLDeviceUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                                       Kokkos::HostSpace>::accessible,
+      "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLSharedUSMSpace,
+                                       Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLSharedUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::assignable,
+                "");
+
+  static_assert(
+      !Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                       Kokkos::HostSpace>::assignable,
+      "");
+
+  static_assert(
+      Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                      Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::assignable,
+                "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::Experimental::SYCLHostUSMSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  //--------------------------------------
+
+  static_assert(!Kokkos::SpaceAccessibility<Kokkos::Experimental::SYCL,
+                                            Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Experimental::SYCL,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(!Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLSharedUSMSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(
+      std::is_same<Kokkos::Impl::HostMirror<
+                       Kokkos::Experimental::SYCLDeviceUSMSpace>::Space,
+                   Kokkos::HostSpace>::value,
+      "");
+
+  static_assert(
+      std::is_same<
+          Kokkos::Impl::HostMirror<
+              Kokkos::Experimental::SYCLSharedUSMSpace>::Space,
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>>::value,
+      "");
+
+  static_assert(
+      Kokkos::Impl::MemorySpaceAccess<Kokkos::Experimental::SYCLHostUSMSpace,
+                                      Kokkos::HostSpace>::accessible,
+      "");
+
+  static_assert(Kokkos::Impl::MemorySpaceAccess<
+                    Kokkos::HostSpace,
+                    Kokkos::Experimental::SYCLHostUSMSpace>::accessible,
+                "");
+
+  static_assert(std::is_same<Kokkos::Impl::HostMirror<
+                                 Kokkos::Experimental::SYCLHostUSMSpace>::Space,
+                             Kokkos::Experimental::SYCLHostUSMSpace>::value,
+                "");
+
+  static_assert(
+      std::is_same<
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>,
+          Kokkos::Device<Kokkos::HostSpace::execution_space,
+                         Kokkos::Experimental::SYCLSharedUSMSpace>>::value,
+      "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<Kokkos::Experimental::SYCL>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLDeviceUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLSharedUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+
+  static_assert(Kokkos::SpaceAccessibility<
+                    Kokkos::Impl::HostMirror<
+                        Kokkos::Experimental::SYCLHostUSMSpace>::Space,
+                    Kokkos::HostSpace>::accessible,
+                "");
+}
+
+TEST(sycl, uvm) {
+  int *uvm_ptr = static_cast<int *>(
+      Kokkos::kokkos_malloc<Kokkos::Experimental::SYCLSharedUSMSpace>(
+          "uvm_ptr", sizeof(int)));
+
+  *uvm_ptr = 42;
+
+  Kokkos::Experimental::SYCL().fence();
+  Kokkos::parallel_for(
+      Kokkos::RangePolicy<Kokkos::Experimental::SYCL>(0, 1),
+      KOKKOS_LAMBDA(int) {
+        if (*uvm_ptr == 42) {
+          *uvm_ptr = 2 * 42;
+        }
+      });
+  Kokkos::Experimental::SYCL().fence();
+
+  EXPECT_EQ(*uvm_ptr, int(2 * 42));
+
+  Kokkos::kokkos_free<Kokkos::Experimental::SYCLSharedUSMSpace>(uvm_ptr);
+}
+
+template <class MemSpace, class ExecSpace>
+struct TestViewSYCLAccessible {
+  enum { N = 1000 };
+
+  using V = Kokkos::View<double *, MemSpace>;
+
+  V m_base;
+
+  struct TagInit {};
+  struct TagTest {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagInit &, const int i) const { m_base[i] = i + 1; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TagTest &, const int i, long &error_count) const {
+    if (m_base[i] != i + 1) ++error_count;
+  }
+
+  TestViewSYCLAccessible() : m_base("base", N) {}
+
+  static void run() {
+    TestViewSYCLAccessible self;
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<typename MemSpace::execution_space, TagInit>(0, N),
+        self);
+    typename MemSpace::execution_space().fence();
+
+    // The next access uses a different execution space; the prior kernel must complete first.
+    long error_count = -1;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace, TagTest>(0, N), self,
+                            error_count);
+    EXPECT_EQ(error_count, 0);
+  }
+};
+
+TEST(sycl, impl_view_accessible) {
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLDeviceUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLSharedUSMSpace,
+                         Kokkos::HostSpace::execution_space>::run();
+
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::Experimental::SYCL>::run();
+  TestViewSYCLAccessible<Kokkos::Experimental::SYCLHostUSMSpace,
+                         Kokkos::HostSpace::execution_space>::run();
+}
+
+}  // namespace Test
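
The static_asserts in the file above pin down which SYCL USM spaces are host-accessible and what `Kokkos::Impl::HostMirror` resolves to for each of them. As a reading aid only, here is a minimal sketch (not part of the patch) of how those traits play out at the View level, assuming a Kokkos build with the SYCL backend enabled; labels and sizes are placeholders.

```cpp
// Sketch only (not part of the patch): how the accessibility / HostMirror
// traits verified above are typically exercised.  Assumes a Kokkos build
// with the SYCL backend enabled; labels and sizes are placeholders.
#include <Kokkos_Core.hpp>

void usm_mirror_sketch() {
  // Device USM: not host-accessible, so host code goes through a mirror.
  Kokkos::View<double*, Kokkos::Experimental::SYCLDeviceUSMSpace> dev("dev", 10);
  auto dev_host = Kokkos::create_mirror_view(dev);  // HostSpace allocation
  Kokkos::deep_copy(dev_host, dev);

  // Shared USM: host-accessible per the SpaceAccessibility assert above,
  // so the host may touch it directly and its host mirror stays in place.
  Kokkos::View<double*, Kokkos::Experimental::SYCLSharedUSMSpace> shr("shr", 10);
  shr(0) = 1.0;
}
```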
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
similarity index 96%
rename from packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
rename to packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
index 6602d7396a7c2fdec7e16e83079764962dbeab75..95a7b68088c1238672b2257d285d7329d52cbec7 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaUVM_SharedAlloc.cpp
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp
@@ -1,3 +1,4 @@
+
 /*
 //@HEADER
 // ************************************************************************
@@ -42,5 +43,5 @@
 //@HEADER
 */
 
-#include <TestCudaUVM_Category.hpp>
-#include <TestSharedAlloc.hpp>
+#include <TestSYCL_Category.hpp>
+#include <TestTaskScheduler.hpp>
diff --git a/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab0d09880f03b56d81ae693d26b5c838b2436a24
--- /dev/null
+++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_TeamScratchStreams.cpp
@@ -0,0 +1,154 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <TestSYCL_Category.hpp>
+#include <Kokkos_Core.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+struct SYCLQueueScratchTestFunctor {
+  using team_t = Kokkos::TeamPolicy<Kokkos::Experimental::SYCL>::member_type;
+  using scratch_t =
+      Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>;
+
+  Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace,
+               Kokkos::MemoryTraits<Kokkos::Atomic>>
+      counter;
+  int N, M;
+  SYCLQueueScratchTestFunctor(
+      Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter_,
+      int N_, int M_)
+      : counter(counter_), N(N_), M(M_) {}
+
+  KOKKOS_FUNCTION
+  void operator()(const team_t& team) const {
+    scratch_t scr(team.team_scratch(1), M);
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M),
+                         [&](int i) { scr[i] = 0; });
+    team.team_barrier();
+    for (int i = 0; i < N; i++) {
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M),
+                           [&](int j) { scr[j] += 1; });
+    }
+    team.team_barrier();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, M), [&](int i) {
+      if (scr[i] != N) counter()++;
+    });
+  }
+};
+
+void sycl_queue_scratch_test_one(
+    int N, int T, int M_base,
+    Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter,
+    Kokkos::Experimental::SYCL sycl, int tid) {
+  int M = M_base + tid * 5;
+  Kokkos::TeamPolicy<Kokkos::Experimental::SYCL> p(sycl, T, 64);
+  using scratch_t =
+      Kokkos::View<int64_t*, Kokkos::Experimental::SYCL::scratch_memory_space>;
+
+  int bytes = scratch_t::shmem_size(M);
+
+  for (int r = 0; r < 15; r++) {
+    Kokkos::parallel_for("Run", p.set_scratch_size(1, Kokkos::PerTeam(bytes)),
+                         SYCLQueueScratchTestFunctor(counter, N, M));
+  }
+}
+
+void sycl_queue_scratch_test(
+    int N, int T, int M_base,
+    Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter) {
+  constexpr int K = 4;
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_context();
+
+  sycl::default_selector device_selector;
+  sycl::queue queue(default_context, device_selector);
+
+  std::array<Kokkos::Experimental::SYCL, K> sycl;
+  for (int i = 0; i < K; i++) {
+    sycl[i] = Kokkos::Experimental::SYCL(
+        sycl::queue(default_context, device_selector));
+  }
+
+  // Test that growing scratch size in subsequent calls doesn't crash things
+#if defined(KOKKOS_ENABLE_OPENMP)
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    // Limit how many threads submit
+    if (tid < 4) {
+      sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+    }
+  }
+#else
+  for (int tid = 0; tid < K; tid++) {
+    sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+  }
+#endif
+  // Test that if everything is large enough, multiple launches with different
+  // scratch sizes don't step on each other
+  for (int tid = K - 1; tid >= 0; tid--) {
+    sycl_queue_scratch_test_one(N, T, M_base, counter, sycl[tid], tid);
+  }
+
+  Kokkos::fence();
+}
+}  // namespace Impl
+
+TEST(sycl, team_scratch_1_queues) {
+  int N      = 1000000;
+  int T      = 10;
+  int M_base = 150;
+
+  Kokkos::View<int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace> counter("C");
+
+  Impl::sycl_queue_scratch_test(N, T, M_base, counter);
+
+  int64_t result;
+  Kokkos::deep_copy(result, counter);
+  ASSERT_EQ(0, result);
+}
+}  // namespace Test
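
For readers unfamiliar with the level-1 team scratch used by `SYCLQueueScratchTestFunctor` above, the pattern is: size the request with `shmem_size`, attach it to the policy with `set_scratch_size`, and placement-construct a scratch View inside the kernel. A minimal sketch (not part of the patch), written against the default execution space with placeholder sizes and labels:

```cpp
// Sketch only (not part of the patch): the per-team scratch pattern used by
// SYCLQueueScratchTestFunctor, on the default execution space.
#include <Kokkos_Core.hpp>

void team_scratch_example(int n_teams, int M) {
  using exec_t    = Kokkos::DefaultExecutionSpace;
  using policy_t  = Kokkos::TeamPolicy<exec_t>;
  using member_t  = policy_t::member_type;
  using scratch_t = Kokkos::View<int64_t*, exec_t::scratch_memory_space>;

  // Ask Kokkos how many bytes a scratch view of length M needs,
  // then request that much level-1 scratch per team.
  const int bytes = scratch_t::shmem_size(M);
  policy_t policy(n_teams, Kokkos::AUTO);

  Kokkos::parallel_for(
      "team_scratch_example",
      policy.set_scratch_size(1, Kokkos::PerTeam(bytes)),
      KOKKOS_LAMBDA(const member_t& team) {
        // Placement-construct the scratch view in the team's scratch pad.
        scratch_t scr(team.team_scratch(1), M);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M),
                             [&](int i) { scr[i] = i; });
        team.team_barrier();
      });
}
```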
diff --git a/packages/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp b/packages/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..870621c1e0d3e4530573fc70ee24208b3b5a7911
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestBuiltinTuners.cpp
@@ -0,0 +1,123 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#include <Kokkos_Core.hpp>
+using ExecSpace  = Kokkos::DefaultHostExecutionSpace;
+using TeamMember = Kokkos::TeamPolicy<ExecSpace>::member_type;
+struct TestTeamFunctor {
+  KOKKOS_FUNCTION void operator()(TeamMember) const {}
+};
+struct TestMDFunctor {
+  KOKKOS_FUNCTION void operator()(const int, const int) const {}
+};
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    Kokkos::TeamPolicy<ExecSpace> teamp(1, Kokkos::AUTO, Kokkos::AUTO);
+    Kokkos::MDRangePolicy<Kokkos::Rank<2>> mdp({0, 0}, {1, 1});
+    Kokkos::Tools::Experimental::TeamSizeTuner team_tune_this(
+        "team_tuner", teamp, TestTeamFunctor{}, Kokkos::ParallelForTag{},
+        Kokkos::Tools::Impl::Impl::SimpleTeamSizeCalculator{});
+
+    Kokkos::Tools::Experimental::MDRangeTuner<2> md_tune_this(
+        "md_tuner", mdp, TestMDFunctor{}, Kokkos::ParallelForTag{},
+        Kokkos::Tools::Impl::Impl::SimpleTeamSizeCalculator{});
+
+    std::vector<int> options{1, 2, 3, 4, 5};
+
+    auto new_team_tuner = team_tune_this.combine("options", options);
+    auto new_md_tuner   = md_tune_this.combine("options", options);
+    using namespace Kokkos::Tools::Experimental;
+    VariableInfo info;
+    info.category      = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity = CandidateValueType::kokkos_value_unbounded;
+    info.type          = ValueType::kokkos_value_string;
+    size_t input       = declare_input_type("kernel", info);
+    VariableValue team_kernel_value = make_variable_value(input, "abs");
+    VariableValue md_kernel_value   = make_variable_value(input, "abs");
+    size_t kernel_context           = get_new_context_id();
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &team_kernel_value);
+    for (int x = 0; x < 10000; ++x) {
+      auto config = new_md_tuner.begin();
+      int option  = std::get<0>(config);
+      (void)option;
+      int tile_x = std::get<1>(config);
+      int tile_y = std::get<2>(config);
+      Kokkos::parallel_for("mdrange",
+                           Kokkos::MDRangePolicy<Kokkos::Rank<2>>(
+                               {0, 0}, {1, 1}, {tile_x, tile_y}),
+                           TestMDFunctor{});
+      new_md_tuner.end();
+    }
+    end_context(kernel_context);
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &md_kernel_value);
+
+    /**
+     * Note that the first argument is essentially a floating-point index
+     * into the outermost dimension of this tuner, which is the options
+     * vector above. At 0.0 this will be the first element (1);
+     * at 0.9 it will be the last element (5).
+     */
+    auto begin_point = new_team_tuner.get_point(0.0, 0.0, 0.0);
+    assert(std::get<0>(begin_point) == 1);
+    (void)begin_point;  // to avoid warnings in some compilers
+    auto end_point = new_team_tuner.get_point(0.9, 0.0, 0.0);
+    (void)end_point;  // to avoid warnings in some compilers
+    assert(std::get<0>(end_point) == 5);
+    for (int x = 0; x < 10000; ++x) {
+      auto config = new_team_tuner.begin();
+      int option  = std::get<0>(config);
+      (void)option;
+      int team   = std::get<1>(config);
+      int vector = std::get<2>(config);
+      Kokkos::parallel_for("mdrange",
+                           Kokkos::TeamPolicy<ExecSpace>(1, team, vector),
+                           TestTeamFunctor{});
+      new_team_tuner.end();
+    }
+    end_context(kernel_context);
+  }
+  Kokkos::finalize();
+}
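
The test above exercises the builtin tuners end to end. Stripped of the tuners themselves, the declare-input / context / set-values plumbing it relies on looks like the following minimal sketch (not part of the patch); the variable name and string value are placeholders.

```cpp
// Sketch only (not part of the patch): the bare input-variable/context
// plumbing used by the test above.
#include <Kokkos_Core.hpp>

void tuning_context_example() {
  using namespace Kokkos::Tools::Experimental;

  // Describe a categorical, string-valued input to the tuning tool.
  VariableInfo info;
  info.category      = StatisticalCategory::kokkos_value_categorical;
  info.valueQuantity = CandidateValueType::kokkos_value_unbounded;
  info.type          = ValueType::kokkos_value_string;
  const size_t input = declare_input_type("kernel", info);

  // Open a context, tell the tool which kernel is about to run,
  // and close the context when the measured region is done.
  const size_t ctx = get_new_context_id();
  begin_context(ctx);
  VariableValue value = make_variable_value(input, "my_kernel");
  set_input_values(ctx, 1, &value);
  // ... launch and time the region of interest here ...
  end_context(ctx);
}
```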
diff --git a/packages/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp b/packages/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2177556d392f002ad17499ad08fe93268a3c7937
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestCategoricalTuner.cpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+// This file tests the categorical tuner
+
+#include <Kokkos_Core.hpp>
+#include <unistd.h>
+struct point {
+  float x;
+  float y;
+  float z;
+};
+void do_computation(const point& test_point) {
+  usleep(((unsigned int)test_point.x) * 100);
+}
+using namespace Kokkos::Tools::Experimental;
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    VariableInfo info;
+    info.category              = StatisticalCategory::kokkos_value_categorical;
+    info.valueQuantity         = CandidateValueType::kokkos_value_unbounded;
+    info.type                  = ValueType::kokkos_value_string;
+    size_t input               = declare_input_type("kernel", info);
+    VariableValue kernel_value = make_variable_value(input, "abs");
+    size_t kernel_context      = get_new_context_id();
+    begin_context(kernel_context);
+    set_input_values(kernel_context, 1, &kernel_value);
+
+    std::vector<point> points;
+    points.push_back({1.0, 1.0, 1.0});
+    points.push_back({10.0, 10.0, 10.0});
+    points.push_back({0.0, 0.0, 0.0});
+    auto tuner =
+        Kokkos::Tools::Experimental::make_categorical_tuner("points", points);
+    for (decltype(points)::size_type x = 0; x < 3000; ++x) {
+      point test_point = tuner.begin();
+      do_computation(test_point);
+      tuner.end();
+    }
+
+    end_context(kernel_context);
+  }
+  Kokkos::finalize();
+}
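
The same `make_categorical_tuner` pattern works for any copyable candidate type, not just the `point` struct above. A minimal sketch (not part of the patch) with a placeholder label and integer candidates; with a tuning tool attached, `end()` feeds the measured cost back so the tool can converge on one candidate.

```cpp
// Sketch only (not part of the patch): the categorical-tuning pattern from
// the test above, applied to a plain list of integer candidates.
#include <Kokkos_Core.hpp>
#include <vector>

void categorical_example() {
  std::vector<int> candidates{32, 64, 128, 256};
  auto tuner = Kokkos::Tools::Experimental::make_categorical_tuner(
      "block_size", candidates);
  for (int rep = 0; rep < 1000; ++rep) {
    int block = tuner.begin();  // one candidate per iteration
    (void)block;                // ... use it to configure the real work ...
    tuner.end();                // report back so a tool can pick a winner
  }
}
```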
diff --git a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
similarity index 94%
rename from packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
rename to packages/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
index 26dc9b0e000096ab1809412c4a29fc563844cbd1..ac0b4d26196351c6654c9b7996931784e4fa2653 100644
--- a/packages/kokkos/core/unit_test/cuda/TestCudaHostPinned_ViewAPI_c.cpp
+++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.cpp
@@ -42,5 +42,8 @@
 //@HEADER
 */
 
-#include <TestCudaHostPinned_Category.hpp>
-#include <TestViewAPI_c.hpp>
+#include <iostream>
+#include "Kokkos_Core.hpp"
+
+#include <tools/TestEventCorrectness.hpp>
+#include "../UnitTestMainInit.cpp"
diff --git a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..430677a335df32737a08520467cd26513f2e83e7
--- /dev/null
+++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp
@@ -0,0 +1,284 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#include <iostream>
+#include <gtest/gtest.h>
+#include "Kokkos_Core.hpp"
+
+#include <impl/Kokkos_Stacktrace.hpp>
+#include <vector>
+#include <algorithm>
+namespace Kokkos {
+class Serial;
+class OpenMP;
+class Cuda;
+class Threads;
+namespace Experimental {
+class SYCL;
+class HIP;
+class OpenMPTarget;
+class HPX;
+}  // namespace Experimental
+}  // namespace Kokkos
+namespace Test {
+struct FencePayload {
+  std::string name;
+  enum distinguishable_devices { yes, no };
+  distinguishable_devices distinguishable;
+  uint32_t dev_id;
+};
+
+std::vector<FencePayload> found_payloads;
+template <typename Lambda>
+void expect_fence_events(std::vector<FencePayload>& expected, Lambda lam) {
+  found_payloads = {};
+  Kokkos::Tools::Experimental::set_begin_fence_callback(
+      [](const char* name, const uint32_t dev_id, uint64_t*) {
+        found_payloads.push_back(
+            FencePayload{std::string(name),
+                         FencePayload::distinguishable_devices::no, dev_id});
+      });
+  Kokkos::Tools::Experimental::set_begin_parallel_for_callback(
+      [](const char* name, const uint32_t dev_id, uint64_t*) {
+        found_payloads.push_back(
+            FencePayload{std::string(name),
+                         FencePayload::distinguishable_devices::no, dev_id});
+      });
+  lam();
+  for (auto& entry : expected) {
+    std::cout << "Ref: " << entry.dev_id << std::endl;
+    std::cout << "Ref: " << entry.name << std::endl;
+    auto search = std::find_if(
+        found_payloads.begin(), found_payloads.end(),
+        [&](const auto& found_entry) {
+          auto name_match =
+              (found_entry.name.find(entry.name) != std::string::npos);
+          auto id_match = (entry.dev_id == found_entry.dev_id);
+          std::cout << found_entry.dev_id << std::endl;
+          std::cout << found_entry.name << std::endl;
+          if (!name_match) {
+            std::cout << "Miss on name\n";
+          }
+          if (!id_match) {
+            std::cout << "Miss on id\n";
+          }
+          return (name_match && id_match);
+        });
+    auto found = (search != found_payloads.end());
+    ASSERT_TRUE(found);
+  }
+  Kokkos::Tools::Experimental::set_begin_fence_callback(
+      [](const char*, const uint32_t, uint64_t*) {});
+  Kokkos::Tools::Experimental::set_begin_parallel_for_callback(
+      [](const char*, const uint32_t, uint64_t*) {});
+}
+
+template <class>
+struct increment {
+  constexpr static const int size = 0;
+};
+int num_instances = 1;
+struct TestFunctor {
+  KOKKOS_FUNCTION void operator()(const int) const {}
+};
+template <typename Lambda>
+void test_wrapper(const Lambda& lambda) {
+  if (!std::is_same<Kokkos::DefaultExecutionSpace, Kokkos::Serial>::value) {
+    lambda();
+  }
+}
+/**
+ * Test that fencing an instance with a name yields a fence
+ * event of that name, and the correct device ID
+ */
+TEST(defaultdevicetype, test_named_instance_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"named_instance", FencePayload::distinguishable_devices::no,
+         root + num_instances}};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      ex.fence("named_instance");
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+/**
+ * Test that fencing an instance without a name yields a fence
+ * event of a correct name, and the correct device ID
+ */
+TEST(defaultdevicetype, test_unnamed_instance_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"Unnamed Instance Fence", FencePayload::distinguishable_devices::no,
+         root + num_instances}};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      ex.fence();
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+
+/**
+ * Test that invoking a global fence with a name yields a fence
+ * event of a correct name, and fences the root of the default device
+ */
+TEST(defaultdevicetype, test_named_global_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+
+    std::vector<FencePayload> expected{
+
+        {"test global fence", FencePayload::distinguishable_devices::no, root}};
+    expect_fence_events(expected,
+                        [=]() { Kokkos::fence("test global fence"); });
+  });
+}
+
+/**
+ * Test that invoking a global fence with no name yields a fence
+ * event of a correct name, and fences the root of the default device
+ */
+TEST(defaultdevicetype, test_unnamed_global_fence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+
+    std::vector<FencePayload> expected{
+
+        {"Unnamed Global Fence", FencePayload::distinguishable_devices::no,
+         root}};
+    expect_fence_events(expected, [=]() { Kokkos::fence(); });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+/**
+ * Test that creating two default instances and fencing both yields
+ * fences with the same device ID, as both should refer to the same instance
+ */
+TEST(defaultdevicetype, test_multiple_default_instances) {
+  test_wrapper([&]() {
+    std::vector<FencePayload> expected{};
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex1;
+      Kokkos::DefaultExecutionSpace ex2;
+      ex1.fence("named_instance_fence_one");
+      ex2.fence("named_instance_fence_two");
+    });
+    ASSERT_TRUE(found_payloads[0].dev_id == found_payloads[1].dev_id);
+  });
+}
+
+/**
+ * Test that fencing and kernels yield events on the correct device IDs
+ */
+TEST(defaultdevicetype, test_kernel_sequence) {
+  test_wrapper([&]() {
+    auto root = Kokkos::Tools::Experimental::device_id_root<
+        Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{
+
+        {"named_instance", FencePayload::distinguishable_devices::no,
+         root + num_instances},
+        {"test_kernel", FencePayload::distinguishable_devices::no,
+         root + num_instances}
+
+    };
+    expect_fence_events(expected, [=]() {
+      Kokkos::DefaultExecutionSpace ex;
+      TestFunctor tf;
+      ex.fence("named_instance");
+      Kokkos::parallel_for(
+          "test_kernel",
+          Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(ex, 0, 1), tf);
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+  });
+}
+#ifdef KOKKOS_ENABLE_CUDA
+/**
+ * CUDA ONLY: test that creating instances from streams leads to events
+ * on different device IDs
+ */
+TEST(defaultdevicetype, test_streams) {
+  test_wrapper([&]() {
+    // auto root = Kokkos::Tools::Experimental::device_id_root<
+    //    Kokkos::DefaultExecutionSpace>();
+    std::vector<FencePayload> expected{};
+    expect_fence_events(expected, [=]() {
+      cudaStream_t s1, s2;
+      cudaStreamCreate(&s1);
+      cudaStreamCreate(&s2);
+      Kokkos::Cuda default_space;
+      Kokkos::Cuda space_s1(s1);
+      Kokkos::Cuda space_s2(s2);
+      default_space.fence();
+      space_s1.fence();
+      space_s2.fence();
+    });
+    num_instances += increment<Kokkos::DefaultExecutionSpace>::size;
+    found_payloads.erase(
+        std::remove_if(found_payloads.begin(), found_payloads.end(),
+                       [&](const auto& entry) {
+                         return (
+                             entry.name.find("Fence on space initialization") !=
+                             std::string::npos);
+                       }),
+        found_payloads.end());
+    ASSERT_TRUE(found_payloads[0].dev_id != found_payloads[1].dev_id);
+    ASSERT_TRUE(found_payloads[2].dev_id != found_payloads[1].dev_id);
+    ASSERT_TRUE(found_payloads[2].dev_id != found_payloads[0].dev_id);
+  });
+}
+
+#endif
+
+}  // namespace Test
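
The `expect_fence_events` helper above works by swapping capture-free lambdas into the profiling callback hooks. As a minimal standalone sketch (not part of the patch), the same hook can be used just to count fences; it assumes Kokkos is already initialized, and the counter and messages are illustrative only.

```cpp
// Sketch only (not part of the patch): counting fences with the same
// callback hook expect_fence_events() uses.
#include <Kokkos_Core.hpp>
#include <cstdint>
#include <iostream>

static int fence_count = 0;  // callbacks must be capture-free

void fence_counting_sketch() {
  Kokkos::Tools::Experimental::set_begin_fence_callback(
      [](const char* name, const uint32_t dev_id, uint64_t*) {
        ++fence_count;
        std::cout << "fence '" << name << "' on device " << dev_id << '\n';
      });

  Kokkos::DefaultExecutionSpace().fence("instance fence");
  Kokkos::fence("global fence");

  // Restore a no-op callback, exactly as expect_fence_events() does.
  Kokkos::Tools::Experimental::set_begin_fence_callback(
      [](const char*, const uint32_t, uint64_t*) {});
}
```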
diff --git a/packages/kokkos/example/query_device/query_device.cpp b/packages/kokkos/example/query_device/query_device.cpp
index a563b06b2864d5d0e855a80b836f3ef70f33f3a1..9c4e9a8c835938c0b301fa1927a2cb5d08e654c1 100644
--- a/packages/kokkos/example/query_device/query_device.cpp
+++ b/packages/kokkos/example/query_device/query_device.cpp
@@ -47,7 +47,8 @@
 
 #include <Kokkos_Macros.hpp>
 
-#if defined(KOKKOS_ENABLE_MPI)
+//#define USE_MPI
+#if defined(USE_MPI)
 #include <mpi.h>
 #endif
 
@@ -61,7 +62,7 @@ int main(int argc, char** argv) {
 
   (void)argc;
   (void)argv;
-#if defined(KOKKOS_ENABLE_MPI)
+#if defined(USE_MPI)
 
   MPI_Init(&argc, &argv);
 
@@ -72,7 +73,7 @@ int main(int argc, char** argv) {
   msg << "MPI rank(" << mpi_rank << ") ";
 
 #endif
-
+  Kokkos::initialize(argc, argv);
   msg << "{" << std::endl;
 
   if (Kokkos::hwloc::available()) {
@@ -82,15 +83,13 @@ int main(int argc, char** argv) {
         << std::endl;
   }
 
-#if defined(KOKKOS_ENABLE_CUDA)
-  Kokkos::Cuda::print_configuration(msg);
-#endif
+  Kokkos::print_configuration(msg);
 
   msg << "}" << std::endl;
 
   std::cout << msg.str();
-
-#if defined(KOKKOS_ENABLE_MPI)
+  Kokkos::finalize();
+#if defined(USE_MPI)
 
   MPI_Finalize();
 
diff --git a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
index 07b99087d4c310e6cf0d82c026f52bd610dd0ecb..5ac7f4fbb060ae952a0685313ec357ffa05abf96 100644
--- a/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
+++ b/packages/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp
@@ -107,8 +107,8 @@ int main(int argc, char* argv[]) {
 
   // ViewType aliases for Rank<2>, Rank<3> for example usage
   using ScalarType  = double;
-  using ViewType_2D = typename Kokkos::View<ScalarType**>;
-  using ViewType_3D = typename Kokkos::View<ScalarType***>;
+  using ViewType_2D = Kokkos::View<ScalarType**>;
+  using ViewType_3D = Kokkos::View<ScalarType***>;
 
   /////////////////////////////////////////////////////////////////////////////
   // Explanation of MDRangePolicy usage, template parameters, constructor
@@ -160,8 +160,7 @@ int main(int argc, char* argv[]) {
   long incorrect_count_2d = 0;
   {
     // Rank<2> Case: Rank is provided, all other parameters are default
-    using MDPolicyType_2D = typename Kokkos::Experimental::MDRangePolicy<
-        Kokkos::Experimental::Rank<2> >;
+    using MDPolicyType_2D = Kokkos::MDRangePolicy<Kokkos::Rank<2> >;
 
     // Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims
     // defaulted
@@ -185,9 +184,8 @@ int main(int argc, char* argv[]) {
   long incorrect_count_3d = 0;
   {
     // Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided
-    using MDPolicyType_3D = typename Kokkos::Experimental::MDRangePolicy<
-        Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left,
-                                   Kokkos::Experimental::Iterate::Left> >;
+    using MDPolicyType_3D = Kokkos::MDRangePolicy<
+        Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left> >;
 
     // Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided
     MDPolicyType_3D mdpolicy_3d({{0, 0, 0}}, {{n, n, n}}, {{4, 4, 4}});
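
The hunks above move the tutorial from the old `Kokkos::Experimental::MDRangePolicy` spelling to the plain `Kokkos::MDRangePolicy`. A minimal sketch (not part of the patch) of the non-Experimental spelling on a Rank-2 iteration space; label, view, and tile sizes are placeholders.

```cpp
// Sketch only (not part of the patch): the non-Experimental MDRangePolicy
// spelling the tutorial now uses, on a Rank<2> iteration space.
#include <Kokkos_Core.hpp>

void mdrange_example(int n) {
  Kokkos::View<double**> a("a", n, n);
  // Lower bounds, upper bounds, optional tile sizes.
  Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy({0, 0}, {n, n}, {4, 4});
  Kokkos::parallel_for(
      "fill", policy, KOKKOS_LAMBDA(int i, int j) { a(i, j) = i * n + j; });
}
```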
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
index 597d1e3056ece9ef5865a3fb79dfef09ccf50a6a..75eca5403fd12ae09f2839ef696fafefa9f8f277 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 
 // These two View types are both 2-D arrays of double.  However, they
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
index 00bfeea36b972e6ea08ab8c82ec5aaca1a4e2af5..0544e572e7e9785369bfc824db783ea2fcd5af53 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
@@ -43,7 +43,7 @@
 */
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
index 20e5c5a284f415e7627fd07df20ffbe5856f3428..52af4bd3b5ba84b3b5c1b53111900a9104e41922 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
@@ -49,7 +49,7 @@
 // the mesh.
 
 #include <Kokkos_Core.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 
 using mesh_type = Kokkos::View<double***, Kokkos::LayoutRight>;
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
index 3c0fcd085c7c2afe29a328dfa3f574ab9ac81276..622b24b93131094ebd3c331d4c4f01ae14cca325 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
index a906ba1447283f3a5b2517e1f6c21839b458b597..596b25aaade065c9ade57f90107e17d6fda3d06a 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
index c582fa17043629bd65b253e6afabd76134f1817b..c03515479d0d7e0365272f87cbe49d11b21f13aa 100644
--- a/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
+++ b/packages/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
@@ -46,7 +46,7 @@
 #include <cstdio>
 #include <typeinfo>
 #include <cmath>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 
 struct FillDevice {
   double value;
diff --git a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
index 9c5f2d62fc58b86cbdd723e3328cf0ba1e38df27..602122b61f1b94788b95052478a6123ae41fbfd9 100644
--- a/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
+++ b/packages/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
@@ -45,7 +45,7 @@
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdlib>
 
 using DefaultHostType = Kokkos::HostSpace::execution_space;
@@ -74,7 +74,7 @@ using DefaultHostType = Kokkos::HostSpace::execution_space;
 template <class GeneratorPool>
 struct generate_random {
   // Output View for the random numbers
-  Kokkos::View<uint64_t*> vals;
+  Kokkos::View<uint64_t**> vals;
 
   // The GeneratorPool
   GeneratorPool rand_pool;
@@ -82,7 +82,7 @@ struct generate_random {
   int samples;
 
   // Initialize all members
-  generate_random(Kokkos::View<uint64_t*> vals_, GeneratorPool rand_pool_,
+  generate_random(Kokkos::View<uint64_t**> vals_, GeneratorPool rand_pool_,
                   int samples_)
       : vals(vals_), rand_pool(rand_pool_), samples(samples_) {}
 
@@ -94,8 +94,7 @@ struct generate_random {
     // Draw samples numbers from the pool as urand64 between 0 and
     // rand_pool.MAX_URAND64 Note there are function calls to get other type of
     // scalars, and also to specify Ranges or get a normal distributed float.
-    for (int k = 0; k < samples; k++)
-      vals(i * samples + k) = rand_gen.urand64();
+    for (int k = 0; k < samples; k++) vals(i, k) = rand_gen.urand64();
 
     // Give the state back, which will allow another thread to acquire it
     rand_pool.free_state(rand_gen);
@@ -103,11 +102,11 @@ struct generate_random {
 };
 
 int main(int argc, char* args[]) {
+  Kokkos::initialize(argc, args);
   if (argc != 3) {
     printf("Please pass two integers on the command line\n");
   } else {
     // Initialize Kokkos
-    Kokkos::initialize(argc, args);
     int size    = std::stoi(args[1]);
     int samples = std::stoi(args[2]);
 
@@ -117,7 +116,7 @@ int main(int argc, char* args[]) {
     // pool.
     Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
     Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
-    Kokkos::DualView<uint64_t*> vals("Vals", size * samples);
+    Kokkos::DualView<uint64_t**> vals("Vals", size, samples);
 
     // Run some performance comparisons
     Kokkos::Timer timer;
@@ -151,8 +150,7 @@ int main(int argc, char* args[]) {
            1.0e-9 * samples * size / time_1024);
 
     Kokkos::deep_copy(vals.h_view, vals.d_view);
-
-    Kokkos::finalize();
   }
+  Kokkos::finalize();
   return 0;
 }
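
The random-numbers hunk above replaces hand-flattened 1-D indexing with a rank-2 View, so Kokkos chooses the layout instead of the example hard-coding one. A tiny sketch (not part of the patch) of the two indexing styles side by side; labels are placeholders.

```cpp
// Sketch only (not part of the patch): the indexing change made above.
#include <Kokkos_Core.hpp>
#include <cstdint>

void indexing_example(int size, int samples) {
  // Before: one flat array, index computed by hand.
  Kokkos::View<uint64_t*> flat("flat", size * samples);
  // After: a rank-2 array indexed as (i, k).
  Kokkos::View<uint64_t**> vals("vals", size, samples);

  Kokkos::parallel_for(
      "fill", size, KOKKOS_LAMBDA(int i) {
        for (int k = 0; k < samples; ++k) {
          flat(i * samples + k) = 0;  // old style
          vals(i, k)            = 0;  // new style, layout-agnostic
        }
      });
}
```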
diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
index d36010892597bbcc9d1be710cae06574e7410ba7..cc20a497b2325825cf5faf01e2fd527e71efee53 100644
--- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
+++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
@@ -44,7 +44,7 @@
 
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include <cstdio>
 #include <cstdlib>
 
diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash
index c601e0ee161fb11833c9f97014205585839a3717..5e33f592183b9da2a7f079a09feeab6943bceebf 100755
--- a/packages/kokkos/generate_makefile.bash
+++ b/packages/kokkos/generate_makefile.bash
@@ -162,6 +162,7 @@ display_help_text() {
       echo "                 VEGA900         = AMD GPU MI25 GFX900"
       echo "                 VEGA906         = AMD GPU MI50/MI60 GFX906"
       echo "                 VEGA908         = AMD GPU MI100 GFX908"
+      echo "                 VEGA90A         = AMD GPU MI200 GFX90A"
       echo "               [ARM]"
       echo "                 ARMV80          = ARMv8.0 Compatible CPU"
       echo "                 ARMV81          = ARMv8.1 Compatible CPU"
@@ -478,5 +479,5 @@ if [[ ${COMPILER} == *clang* ]]; then
    fi
 fi
 
-echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${KOKKOS_PATH}
-cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH}
+echo cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
+cmake $COMPILER_CMD  -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH}
diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt
index be8a5e7da5f4d8cada30fea4f78a21656268c8ef..69cd133b44b013145a3a4cfa4bb8c0124ef68d73 100644
--- a/packages/kokkos/master_history.txt
+++ b/packages/kokkos/master_history.txt
@@ -25,3 +25,4 @@ tag:  3.3.00     date: 12:16:2020    master: 734f577a    release: 1535ba5c
 tag:  3.3.01     date: 01:06:2021    master: 6d65b5a3    release: 4d23839c
 tag:  3.4.00     date: 04:26:2021    master: 1fb0c284    release: 5d7738d6
 tag:  3.4.01     date: 05:20:2021    master: 4b97a22f    release: 410b15c8
+tag:  3.5.00     date: 10:28:2021    master: c28a8b03    release: ddad6256
diff --git a/packages/kokkos/scripts/docker/Dockerfile.clang b/packages/kokkos/scripts/docker/Dockerfile.clang
index 6aaf75fae55ff975df5045bb73a0813236871d89..92999a8a44a54c22a40e717a5858a9d0dc5b7199 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.clang
+++ b/packages/kokkos/scripts/docker/Dockerfile.clang
@@ -9,16 +9,22 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
@@ -28,13 +34,11 @@ ENV PATH=${CMAKE_DIR}/bin:$PATH
 
 ENV LLVM_DIR=/opt/llvm
 RUN LLVM_VERSION=8.0.0 && \
-    LLVM_KEY=345AD05D && \
     LLVM_URL=http://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \
     LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
     wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \
     wget --quiet ${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${LLVM_KEY} && \
     gpg --verify ${LLVM_ARCHIVE}.sig ${LLVM_ARCHIVE} && \
     mkdir -p ${LLVM_DIR} && \
     tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.gcc b/packages/kokkos/scripts/docker/Dockerfile.gcc
index 56972d3185d0f62e6b9effb64e8f2cedefe25c66..51d50e64063b611a79a86c8bea159c6435bdc492 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.gcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.gcc
@@ -1,15 +1,21 @@
 FROM gcc:5.3.0
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.hipcc b/packages/kokkos/scripts/docker/Dockerfile.hipcc
index d3b6b93a023396aa785703a5aeec0c4001af34e8..5bef7f2ef814ad7420b8c0d4bdefa0961f4dd211 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.hipcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.hipcc
@@ -1,8 +1,7 @@
-ARG BASE=rocm/dev-ubuntu-20.04:3.8
+ARG BASE=rocm/dev-ubuntu-20.04:4.2
 FROM $BASE
 
 RUN apt-get update && apt-get install -y \
-        git \
         kmod \
         wget \
         ccache \
@@ -13,16 +12,22 @@ RUN apt-get update && apt-get install -y \
 
 ENV PATH=/opt/rocm/bin:$PATH
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
index 5d53a645e4bc7c551698719d3edb1c3768467ca7..3de9a7f5804f938d3c4056c723f9a25ceb189242 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
+++ b/packages/kokkos/scripts/docker/Dockerfile.kokkosllvmproject
@@ -11,16 +11,22 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.nvcc b/packages/kokkos/scripts/docker/Dockerfile.nvcc
index e17accc0663980694821b8002b976277fcd9ca42..8a054066bde8e1983c9b9baf511836c88eabefa5 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.nvcc
+++ b/packages/kokkos/scripts/docker/Dockerfile.nvcc
@@ -5,7 +5,6 @@ ARG ADDITIONAL_PACKAGES
 
 RUN apt-get update && apt-get install -y \
         bc \
-        git \
         wget \
         ccache \
         $ADDITIONAL_PACKAGES \
@@ -13,16 +12,22 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.16.8
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
index b6efcb82cae1a8da1cf82e050bf4ad7b8a7870e4..5a676ca32a484dec5ee6e89ca3b4acf21acbbd81 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.openmptarget
+++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget
@@ -14,16 +14,22 @@ RUN apt-get update && apt-get install -y \
 
 ARG NPROC=8
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.18.5
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver hkps.pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl
index fdcd6d01fb8e3158000aa1507bb5bfcf7e0d9b4e..3393d0da8a7f257f71d79eb5dfd9f76b5bfd6a31 100644
--- a/packages/kokkos/scripts/docker/Dockerfile.sycl
+++ b/packages/kokkos/scripts/docker/Dockerfile.sycl
@@ -3,25 +3,31 @@ FROM $BASE
 
 RUN apt-get update && apt-get install -y \
         bc \
-        git \
         wget \
         ccache \
         ninja-build \
         python3 \
+        git \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \
+    KEYDUMP_FILE=keydump && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \
+    wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \
+    gpg --import ${KEYDUMP_FILE} && \
+    gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \
+    rm ${KEYDUMP_FILE}*
+
 ARG CMAKE_VERSION=3.18.5
 ENV CMAKE_DIR=/opt/cmake
-RUN CMAKE_KEY=2D2CEF1034921684 && \
-    CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
+RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \
     CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \
     CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \
     wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \
-    gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \
     gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \
     grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \
     mkdir -p ${CMAKE_DIR} && \
@@ -30,7 +36,7 @@ RUN CMAKE_KEY=2D2CEF1034921684 && \
 ENV PATH=${CMAKE_DIR}/bin:$PATH
 
 ENV SYCL_DIR=/opt/sycl
-RUN SYCL_VERSION=20210311 && \
+RUN SYCL_VERSION=20210621 && \
     SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \
     SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \
     SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \
diff --git a/packages/kokkos/scripts/testing_scripts/test_all_sandia b/packages/kokkos/scripts/testing_scripts/test_all_sandia
index 877b35b73e1aef7c64cdb2d7e5f00f7bc235781c..3e0295643e48b8af85a8aa39874545f7340b157b 100755
--- a/packages/kokkos/scripts/testing_scripts/test_all_sandia
+++ b/packages/kokkos/scripts/testing_scripts/test_all_sandia
@@ -108,6 +108,10 @@ if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name
   MACHINE=mayer
 fi
 
+if [[ "$HOSTNAME" == caraway* ]]; then
+  MACHINE=caraway
+fi
+
 if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then
   MACHINE=kokkos-dev
 fi
@@ -302,6 +306,7 @@ if [ "$MACHINE" = "sems" ]; then
                "clang/5.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/7.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/9.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/10.0.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/18.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/19.0.5 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
@@ -430,8 +435,8 @@ elif [ "$MACHINE" = "weaver" ]; then
 
   BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
   IBM_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/xl/<COMPILER_VERSION>,gcc/7.2.0"
-  CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.2.0,ibm/xl/16.1.1"
-  CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,gcc/7.4.0,ibm/xl/16.1.1"
+  CUDA_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,ibm/xl/16.1.1,gcc/7.2.0"
+  CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,ibm/xl/16.1.1,gcc/7.4.0"
 
   # Don't do pthread with Power
   GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
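The weaver change only reorders the comma-separated entries so that `ibm/xl/16.1.1` comes before the gcc module. Assuming the harness loads modules in list order after substituting the placeholders (the consuming code is outside this diff), that ordering controls which module's environment takes effect last. A hypothetical expansion of one list for a `cuda/10.1` entry:

```bash
# Hypothetical illustration of how a module list might be expanded and loaded in order.
CUDA10_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>,ibm/xl/16.1.1,gcc/7.4.0"
compiler=cuda; version=10.1
modules=$(echo "$CUDA10_MODULE_LIST" | sed -e "s|<COMPILER_NAME>|$compiler|" -e "s|<COMPILER_VERSION>|$version|")
for m in ${modules//,/ }; do
  echo "module load $m"   # would load cmake, cuda/10.1, ibm/xl/16.1.1, then gcc/7.4.0
done
```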
@@ -494,6 +499,23 @@ elif [ "$MACHINE" = "mayer" ]; then
     ARCH_FLAG="--arch=ARMV8_THUNDERX2"
   fi
 
+elif [ "$MACHINE" = "caraway" ]; then
+  SKIP_HWLOC=True
+
+  BASE_MODULE_LIST="cmake/3.19.3,<COMPILER_NAME>/<COMPILER_VERSION>"
+
+  HIPCLANG_BUILD_LIST="Hip_Serial,Hip_OpenMP"
+  HIPCLANG_WARNING_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG"
+
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("rocm/4.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
+             "rocm/4.3.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS"
+  )
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=VEGA906"
+  fi
+
 elif [ "$MACHINE" = "blake" ]; then
   source /etc/profile.d/modules.sh
   SKIP_HWLOC=True
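Each `COMPILERS` entry packs five whitespace-separated fields in the format noted in the comment above (compiler, module list, build list, executable, warning flags). A sketch of how one of the new caraway entries would split into those fields; the real parsing lives elsewhere in the script, and the module list is shown here already substituted, so this is illustrative only:

```bash
# Illustrative split of one COMPILERS entry into its five fields.
entry="rocm/4.3.0 cmake/3.19.3,rocm/4.3.0 Hip_Serial,Hip_OpenMP hipcc -Werror -Wno-unused-command-line-argument -DNDEBUG"
read -r compiler module_list build_list exe warn_rest <<< "$entry"
echo "$compiler"     # rocm/4.3.0
echo "$build_list"   # Hip_Serial,Hip_OpenMP
echo "$exe"          # hipcc
echo "$warn_rest"    # remaining warning flags, captured as one trailing field
```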
@@ -597,8 +619,9 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then
 
   BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>"
   GCC91_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>"
-  NVCC9_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
+  NVCC9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
   NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
+  NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.17.1,sems-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/7.3.0"
   NVCC11_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/9.2.0"
 
   CLANG8_MODULE_LIST="sems-env,sems-cmake/3.17.1,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/10.0"
@@ -620,13 +643,16 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then
   else
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("cuda/10.0 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "cuda/10.1 $NVCC_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/10.1 $NVCC_SEMSMODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/11.0 $NVCC11_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/11.1 $NVCC_SEMSMODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/11.2 $NVCC11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/9.2 $NVCC9_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
                "clang/8.0 $CLANG8_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
                "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/7.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/9.1 $GCC91_MODULE_LIST "$GCC_BUILD_LIST" g++ $GCC_WARNING_FLAGS"
diff --git a/packages/kokkos/scripts/testing_scripts/update_lib.sh b/packages/kokkos/scripts/testing_scripts/update_lib.sh
index 34ab5dd3c9a0afae4b10b70d99772308f35b3f9f..ee2f66dc407a76dd7f70c6c14b789fd42584fb11 100755
--- a/packages/kokkos/scripts/testing_scripts/update_lib.sh
+++ b/packages/kokkos/scripts/testing_scripts/update_lib.sh
@@ -20,7 +20,7 @@ check_sems_clang() {
   CLANGVER=$(clang --version | grep "clang version" | cut -d " " -f 3)
   if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then
     # Newer gcc needed for c++ standard beyond c++14
-    module swap sems-gcc/5.3.0 sems-gcc/6.4.0
+    module swap sems-gcc/5.3.0 sems-gcc/8.3.0
     module list
   fi
 }
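`check_sems_clang` extracts the clang version from `clang --version` and, for clang 9.x/10.x, swaps in a newer SEMS gcc so the accompanying libstdc++ supports a C++ standard beyond C++14; the patch raises that swap target from sems-gcc/6.4.0 to sems-gcc/8.3.0. The version parsing can be exercised on its own; the sample version string below is made up for illustration:

```bash
# Stand-alone check of the version parsing used above, with a made-up version string.
sample="clang version 10.0.1 (https://github.com/llvm/llvm-project ...)"
CLANGVER=$(echo "$sample" | grep "clang version" | cut -d " " -f 3)
if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then
  echo "would run: module swap sems-gcc/5.3.0 sems-gcc/8.3.0"
fi
```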